1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2 ; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck --check-prefix=EG %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; Per-workitem signed minimum on i32: %a and %b are loaded from %a.ptr and
; %b.ptr at index workitem.id.x, combined with icmp sle + select, and the
; result is stored to %out at the same index. The checks below expect the
; compare/select pair to become a single MIN_INT on the r600 (cypress) run
; and a single v_min_i32_e32 on every amdgcn run.
9 define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
10 ; EG-LABEL: v_test_imin_sle_i32:
12 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
14 ; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
15 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
18 ; EG-NEXT: Fetch clause starting at 6:
19 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
20 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
21 ; EG-NEXT: ALU clause starting at 10:
22 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
23 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
24 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
25 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
26 ; EG-NEXT: ALU clause starting at 14:
27 ; EG-NEXT: MIN_INT T0.X, T0.X, T1.X,
28 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
29 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
30 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
32 ; CI-LABEL: v_test_imin_sle_i32:
34 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
35 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
36 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
37 ; CI-NEXT: s_waitcnt lgkmcnt(0)
38 ; CI-NEXT: v_mov_b32_e32 v1, s3
39 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
40 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
41 ; CI-NEXT: v_mov_b32_e32 v3, s5
42 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
43 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
44 ; CI-NEXT: flat_load_dword v5, v[0:1]
45 ; CI-NEXT: flat_load_dword v2, v[2:3]
46 ; CI-NEXT: v_mov_b32_e32 v1, s1
47 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
48 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
49 ; CI-NEXT: s_waitcnt vmcnt(0)
50 ; CI-NEXT: v_min_i32_e32 v2, v5, v2
51 ; CI-NEXT: flat_store_dword v[0:1], v2
54 ; VI-LABEL: v_test_imin_sle_i32:
56 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
57 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
58 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
59 ; VI-NEXT: s_waitcnt lgkmcnt(0)
60 ; VI-NEXT: v_mov_b32_e32 v1, s3
61 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
62 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
63 ; VI-NEXT: v_mov_b32_e32 v3, s5
64 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
65 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
66 ; VI-NEXT: flat_load_dword v5, v[0:1]
67 ; VI-NEXT: flat_load_dword v2, v[2:3]
68 ; VI-NEXT: v_mov_b32_e32 v1, s1
69 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
70 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
71 ; VI-NEXT: s_waitcnt vmcnt(0)
72 ; VI-NEXT: v_min_i32_e32 v2, v5, v2
73 ; VI-NEXT: flat_store_dword v[0:1], v2
76 ; GFX9-LABEL: v_test_imin_sle_i32:
78 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
79 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
80 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
81 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
82 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
83 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
84 ; GFX9-NEXT: s_waitcnt vmcnt(0)
85 ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2
86 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
89 ; GFX10-LABEL: v_test_imin_sle_i32:
91 ; GFX10-NEXT: s_clause 0x1
92 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
93 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
94 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
95 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX10-NEXT: s_clause 0x1
97 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
98 ; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
99 ; GFX10-NEXT: s_waitcnt vmcnt(0)
100 ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2
101 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
102 ; GFX10-NEXT: s_endpgm
104 ; GFX11-LABEL: v_test_imin_sle_i32:
106 ; GFX11-NEXT: s_clause 0x1
107 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
108 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
109 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
110 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
111 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
112 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
113 ; GFX11-NEXT: s_clause 0x1
114 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
115 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1]
116 ; GFX11-NEXT: s_waitcnt vmcnt(0)
117 ; GFX11-NEXT: v_min_i32_e32 v1, v1, v2
118 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
119 ; GFX11-NEXT: s_nop 0
120 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
121 ; GFX11-NEXT: s_endpgm
122 %tid = call i32 @llvm.amdgcn.workitem.id.x()
123 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
124 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
125 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
126 %a = load i32, ptr addrspace(1) %a.gep, align 4
127 %b = load i32, ptr addrspace(1) %b.gep, align 4
128 %cmp = icmp sle i32 %a, %b
129 %val = select i1 %cmp, i32 %a, i32 %b ; %val = signed min(%a, %b)
130 store i32 %val, ptr addrspace(1) %out.gep, align 4
; Signed minimum of two i32 kernel arguments (icmp sle + select). The checks
; expect one s_min_i32 on every amdgcn run, and one MIN_INT that reads both
; operands directly from KC0 on the r600 (cypress) run.
134 define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
135 ; EG-LABEL: s_test_imin_sle_i32:
137 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
138 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
141 ; EG-NEXT: ALU clause starting at 4:
142 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
143 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
144 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
146 ; CI-LABEL: s_test_imin_sle_i32:
148 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
149 ; CI-NEXT: s_waitcnt lgkmcnt(0)
150 ; CI-NEXT: s_min_i32 s2, s2, s3
151 ; CI-NEXT: v_mov_b32_e32 v0, s0
152 ; CI-NEXT: v_mov_b32_e32 v1, s1
153 ; CI-NEXT: v_mov_b32_e32 v2, s2
154 ; CI-NEXT: flat_store_dword v[0:1], v2
157 ; VI-LABEL: s_test_imin_sle_i32:
159 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
160 ; VI-NEXT: s_waitcnt lgkmcnt(0)
161 ; VI-NEXT: s_min_i32 s2, s2, s3
162 ; VI-NEXT: v_mov_b32_e32 v0, s0
163 ; VI-NEXT: v_mov_b32_e32 v1, s1
164 ; VI-NEXT: v_mov_b32_e32 v2, s2
165 ; VI-NEXT: flat_store_dword v[0:1], v2
168 ; GFX9-LABEL: s_test_imin_sle_i32:
170 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
171 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
172 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
173 ; GFX9-NEXT: s_min_i32 s2, s2, s3
174 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
175 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
176 ; GFX9-NEXT: s_endpgm
178 ; GFX10-LABEL: s_test_imin_sle_i32:
180 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
181 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
182 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX10-NEXT: s_min_i32 s2, s2, s3
184 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
185 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
186 ; GFX10-NEXT: s_endpgm
188 ; GFX11-LABEL: s_test_imin_sle_i32:
190 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
191 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
192 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
193 ; GFX11-NEXT: s_min_i32 s2, s2, s3
194 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
195 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
196 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
197 ; GFX11-NEXT: s_nop 0
198 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
199 ; GFX11-NEXT: s_endpgm
200 %cmp = icmp sle i32 %a, %b
201 %val = select i1 %cmp, i32 %a, i32 %b ; %val = signed min(%a, %b)
202 store i32 %val, ptr addrspace(1) %out, align 4
; Signed minimum of two <1 x i32> kernel arguments (icmp sle + select on a
; one-element vector). The checks expect one s_min_i32 on every amdgcn run,
; one MIN_INT on the r600 (cypress) run, and a single dword store; no
; vector-specific instructions are expected.
206 define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
207 ; EG-LABEL: s_test_imin_sle_v1i32:
209 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
210 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
213 ; EG-NEXT: ALU clause starting at 4:
214 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
215 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
216 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
218 ; CI-LABEL: s_test_imin_sle_v1i32:
220 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
221 ; CI-NEXT: s_waitcnt lgkmcnt(0)
222 ; CI-NEXT: s_min_i32 s2, s2, s3
223 ; CI-NEXT: v_mov_b32_e32 v0, s0
224 ; CI-NEXT: v_mov_b32_e32 v1, s1
225 ; CI-NEXT: v_mov_b32_e32 v2, s2
226 ; CI-NEXT: flat_store_dword v[0:1], v2
229 ; VI-LABEL: s_test_imin_sle_v1i32:
231 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
232 ; VI-NEXT: s_waitcnt lgkmcnt(0)
233 ; VI-NEXT: s_min_i32 s2, s2, s3
234 ; VI-NEXT: v_mov_b32_e32 v0, s0
235 ; VI-NEXT: v_mov_b32_e32 v1, s1
236 ; VI-NEXT: v_mov_b32_e32 v2, s2
237 ; VI-NEXT: flat_store_dword v[0:1], v2
240 ; GFX9-LABEL: s_test_imin_sle_v1i32:
242 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
243 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
244 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
245 ; GFX9-NEXT: s_min_i32 s2, s2, s3
246 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
247 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
248 ; GFX9-NEXT: s_endpgm
250 ; GFX10-LABEL: s_test_imin_sle_v1i32:
252 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
253 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
254 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
255 ; GFX10-NEXT: s_min_i32 s2, s2, s3
256 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
257 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
258 ; GFX10-NEXT: s_endpgm
260 ; GFX11-LABEL: s_test_imin_sle_v1i32:
262 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
263 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
264 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
265 ; GFX11-NEXT: s_min_i32 s2, s2, s3
266 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
267 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
268 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
269 ; GFX11-NEXT: s_nop 0
270 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
271 ; GFX11-NEXT: s_endpgm
272 %cmp = icmp sle <1 x i32> %a, %b
273 %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b ; %val = signed min(%a, %b)
274 store <1 x i32> %val, ptr addrspace(1) %out
; Element-wise signed minimum of two <4 x i32> kernel arguments (icmp sle +
; select). The checks expect four independent s_min_i32 on every amdgcn run
; or four MIN_INT on the r600 (cypress) run, followed by one store that
; writes all four results at once.
278 define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b) #0 {
279 ; EG-LABEL: s_test_imin_sle_v4i32:
281 ; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
282 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
285 ; EG-NEXT: ALU clause starting at 4:
286 ; EG-NEXT: MIN_INT * T0.W, KC0[4].X, KC0[5].X,
287 ; EG-NEXT: MIN_INT * T0.Z, KC0[3].W, KC0[4].W,
288 ; EG-NEXT: MIN_INT * T0.Y, KC0[3].Z, KC0[4].Z,
289 ; EG-NEXT: MIN_INT * T0.X, KC0[3].Y, KC0[4].Y,
290 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
291 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
293 ; CI-LABEL: s_test_imin_sle_v4i32:
295 ; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4
296 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
297 ; CI-NEXT: s_waitcnt lgkmcnt(0)
298 ; CI-NEXT: s_min_i32 s2, s11, s15
299 ; CI-NEXT: s_min_i32 s3, s10, s14
300 ; CI-NEXT: s_min_i32 s4, s9, s13
301 ; CI-NEXT: s_min_i32 s5, s8, s12
302 ; CI-NEXT: v_mov_b32_e32 v5, s1
303 ; CI-NEXT: v_mov_b32_e32 v0, s5
304 ; CI-NEXT: v_mov_b32_e32 v1, s4
305 ; CI-NEXT: v_mov_b32_e32 v2, s3
306 ; CI-NEXT: v_mov_b32_e32 v3, s2
307 ; CI-NEXT: v_mov_b32_e32 v4, s0
308 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
311 ; VI-LABEL: s_test_imin_sle_v4i32:
313 ; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10
314 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
315 ; VI-NEXT: s_waitcnt lgkmcnt(0)
316 ; VI-NEXT: s_min_i32 s2, s11, s15
317 ; VI-NEXT: s_min_i32 s3, s10, s14
318 ; VI-NEXT: s_min_i32 s4, s9, s13
319 ; VI-NEXT: s_min_i32 s5, s8, s12
320 ; VI-NEXT: v_mov_b32_e32 v5, s1
321 ; VI-NEXT: v_mov_b32_e32 v0, s5
322 ; VI-NEXT: v_mov_b32_e32 v1, s4
323 ; VI-NEXT: v_mov_b32_e32 v2, s3
324 ; VI-NEXT: v_mov_b32_e32 v3, s2
325 ; VI-NEXT: v_mov_b32_e32 v4, s0
326 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
329 ; GFX9-LABEL: s_test_imin_sle_v4i32:
331 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10
332 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
333 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
334 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
335 ; GFX9-NEXT: s_min_i32 s2, s11, s15
336 ; GFX9-NEXT: s_min_i32 s3, s10, s14
337 ; GFX9-NEXT: s_min_i32 s4, s9, s13
338 ; GFX9-NEXT: s_min_i32 s5, s8, s12
339 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
340 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
341 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
342 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
343 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
344 ; GFX9-NEXT: s_endpgm
346 ; GFX10-LABEL: s_test_imin_sle_v4i32:
348 ; GFX10-NEXT: s_clause 0x1
349 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10
350 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
351 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
352 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
353 ; GFX10-NEXT: s_min_i32 s2, s11, s15
354 ; GFX10-NEXT: s_min_i32 s3, s10, s14
355 ; GFX10-NEXT: s_min_i32 s4, s8, s12
356 ; GFX10-NEXT: s_min_i32 s5, s9, s13
357 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
358 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
359 ; GFX10-NEXT: v_mov_b32_e32 v2, s3
360 ; GFX10-NEXT: v_mov_b32_e32 v3, s2
361 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
362 ; GFX10-NEXT: s_endpgm
364 ; GFX11-LABEL: s_test_imin_sle_v4i32:
366 ; GFX11-NEXT: s_clause 0x1
367 ; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10
368 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
369 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
370 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
371 ; GFX11-NEXT: s_min_i32 s2, s7, s11
372 ; GFX11-NEXT: s_min_i32 s3, s6, s10
373 ; GFX11-NEXT: s_min_i32 s4, s4, s8
374 ; GFX11-NEXT: s_min_i32 s5, s5, s9
375 ; GFX11-NEXT: v_mov_b32_e32 v0, s4
376 ; GFX11-NEXT: v_mov_b32_e32 v1, s5
377 ; GFX11-NEXT: v_mov_b32_e32 v2, s3
378 ; GFX11-NEXT: v_mov_b32_e32 v3, s2
379 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
380 ; GFX11-NEXT: s_nop 0
381 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
382 ; GFX11-NEXT: s_endpgm
383 %cmp = icmp sle <4 x i32> %a, %b
384 %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b ; element-wise signed min
385 store <4 x i32> %val, ptr addrspace(1) %out
; Signed minimum of two i8 kernel arguments (icmp sle + select). An unnamed
; [8 x i32] argument precedes each of %a and %b.
; NOTE(review): presumably the padding arguments force each i8 into its own
; separately loaded kernarg dword - confirm.
; The checks expect both operands to be sign-extended from 8 bits first
; (s_sext_i32_i8 on amdgcn, BFE_INT with width 8 on r600), then a 32-bit min
; (s_min_i32 / MIN_INT), then a byte-sized store.
389 define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 {
390 ; EG-LABEL: s_test_imin_sle_i8:
392 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
394 ; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
395 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
398 ; EG-NEXT: Fetch clause starting at 6:
399 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 72, #3
400 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 108, #3
401 ; EG-NEXT: ALU clause starting at 10:
402 ; EG-NEXT: MOV * T0.X, 0.0,
403 ; EG-NEXT: ALU clause starting at 11:
404 ; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
405 ; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
406 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
407 ; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
408 ; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
409 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
410 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
411 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
412 ; EG-NEXT: LSHL T0.X, PV.W, PS,
413 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
414 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
415 ; EG-NEXT: MOV T0.Y, 0.0,
416 ; EG-NEXT: MOV * T0.Z, 0.0,
417 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
418 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
420 ; CI-LABEL: s_test_imin_sle_i8:
422 ; CI-NEXT: s_load_dword s2, s[6:7], 0xa
423 ; CI-NEXT: s_load_dword s3, s[6:7], 0x13
424 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
425 ; CI-NEXT: s_waitcnt lgkmcnt(0)
426 ; CI-NEXT: s_sext_i32_i8 s2, s2
427 ; CI-NEXT: s_sext_i32_i8 s3, s3
428 ; CI-NEXT: s_min_i32 s2, s2, s3
429 ; CI-NEXT: v_mov_b32_e32 v0, s0
430 ; CI-NEXT: v_mov_b32_e32 v1, s1
431 ; CI-NEXT: v_mov_b32_e32 v2, s2
432 ; CI-NEXT: flat_store_byte v[0:1], v2
435 ; VI-LABEL: s_test_imin_sle_i8:
437 ; VI-NEXT: s_load_dword s2, s[6:7], 0x28
438 ; VI-NEXT: s_load_dword s3, s[6:7], 0x4c
439 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
440 ; VI-NEXT: s_waitcnt lgkmcnt(0)
441 ; VI-NEXT: s_sext_i32_i8 s2, s2
442 ; VI-NEXT: s_sext_i32_i8 s3, s3
443 ; VI-NEXT: s_min_i32 s2, s2, s3
444 ; VI-NEXT: v_mov_b32_e32 v0, s0
445 ; VI-NEXT: v_mov_b32_e32 v1, s1
446 ; VI-NEXT: v_mov_b32_e32 v2, s2
447 ; VI-NEXT: flat_store_byte v[0:1], v2
450 ; GFX9-LABEL: s_test_imin_sle_i8:
452 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28
453 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c
454 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
455 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
456 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
457 ; GFX9-NEXT: s_sext_i32_i8 s2, s2
458 ; GFX9-NEXT: s_sext_i32_i8 s3, s3
459 ; GFX9-NEXT: s_min_i32 s2, s2, s3
460 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
461 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
462 ; GFX9-NEXT: s_endpgm
464 ; GFX10-LABEL: s_test_imin_sle_i8:
466 ; GFX10-NEXT: s_clause 0x2
467 ; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28
468 ; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c
469 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
470 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
471 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
472 ; GFX10-NEXT: s_sext_i32_i8 s2, s2
473 ; GFX10-NEXT: s_sext_i32_i8 s3, s3
474 ; GFX10-NEXT: s_min_i32 s2, s2, s3
475 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
476 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
477 ; GFX10-NEXT: s_endpgm
479 ; GFX11-LABEL: s_test_imin_sle_i8:
481 ; GFX11-NEXT: s_clause 0x2
482 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28
483 ; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c
484 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
485 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
486 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
487 ; GFX11-NEXT: s_sext_i32_i8 s2, s4
488 ; GFX11-NEXT: s_sext_i32_i8 s3, s5
489 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
490 ; GFX11-NEXT: s_min_i32 s2, s2, s3
491 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
492 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
493 ; GFX11-NEXT: s_nop 0
494 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
495 ; GFX11-NEXT: s_endpgm
496 %cmp = icmp sle i8 %a, %b
497 %val = select i1 %cmp, i8 %a, i8 %b ; %val = signed min(%a, %b)
498 store i8 %val, ptr addrspace(1) %out
502 ; FIXME: Why vector and sdwa for last element?
; Element-wise signed minimum of two <4 x i8> kernel arguments (icmp sle +
; select), with the four byte results packed back into one dword for the
; store. The expected instruction mix differs per run line:
;   - cypress and kaveri expect four 32-bit mins on sign-extended bytes
;     (MIN_INT / s_min_i32)
;   - tonga expects three s_min_i32 plus one v_min_i32_sdwa
;   - gfx900 expects two v_min_i16_sdwa plus two v_min_i16_e32
;   - gfx1010 and gfx1100 expect four v_min_i16
504 define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 {
505 ; EG-LABEL: s_test_imin_sle_v4i8:
507 ; EG-NEXT: ALU 0, @22, KC0[], KC1[]
509 ; EG-NEXT: ALU 30, @23, KC0[CB0:0-32], KC1[]
510 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
513 ; EG-NEXT: Fetch clause starting at 6:
514 ; EG-NEXT: VTX_READ_8 T5.X, T4.X, 74, #3
515 ; EG-NEXT: VTX_READ_8 T6.X, T4.X, 108, #3
516 ; EG-NEXT: VTX_READ_8 T7.X, T4.X, 72, #3
517 ; EG-NEXT: VTX_READ_8 T8.X, T4.X, 111, #3
518 ; EG-NEXT: VTX_READ_8 T9.X, T4.X, 75, #3
519 ; EG-NEXT: VTX_READ_8 T10.X, T4.X, 109, #3
520 ; EG-NEXT: VTX_READ_8 T11.X, T4.X, 73, #3
521 ; EG-NEXT: VTX_READ_8 T4.X, T4.X, 110, #3
522 ; EG-NEXT: ALU clause starting at 22:
523 ; EG-NEXT: MOV * T4.X, 0.0,
524 ; EG-NEXT: ALU clause starting at 23:
525 ; EG-NEXT: BFE_INT T0.Z, T5.X, 0.0, literal.x,
526 ; EG-NEXT: BFE_INT * T0.W, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
527 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
528 ; EG-NEXT: BFE_INT T4.X, T11.X, 0.0, literal.x,
529 ; EG-NEXT: BFE_INT T0.Y, T10.X, 0.0, literal.x, BS:VEC_120/SCL_212
530 ; EG-NEXT: BFE_INT * T1.Z, T9.X, 0.0, literal.x, BS:VEC_201
531 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
532 ; EG-NEXT: BFE_INT T1.W, T8.X, 0.0, literal.x,
533 ; EG-NEXT: MIN_INT * T0.W, T0.Z, T0.W,
534 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
535 ; EG-NEXT: MIN_INT T0.Z, T1.Z, PV.W,
536 ; EG-NEXT: AND_INT T0.W, PS, literal.x,
537 ; EG-NEXT: MIN_INT * T1.W, T4.X, T0.Y,
538 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
539 ; EG-NEXT: AND_INT T4.X, PS, literal.x,
540 ; EG-NEXT: LSHL T0.Y, PV.W, literal.y,
541 ; EG-NEXT: BFE_INT T1.Z, T7.X, 0.0, literal.z,
542 ; EG-NEXT: BFE_INT T0.W, T6.X, 0.0, literal.z, BS:VEC_120/SCL_212
543 ; EG-NEXT: LSHL * T1.W, PV.Z, literal.w,
544 ; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
545 ; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
546 ; EG-NEXT: MIN_INT T0.Z, PV.Z, PV.W,
547 ; EG-NEXT: OR_INT T0.W, PS, PV.Y,
548 ; EG-NEXT: LSHL * T1.W, PV.X, literal.x,
549 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
550 ; EG-NEXT: OR_INT T0.W, PV.W, PS,
551 ; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x,
552 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
553 ; EG-NEXT: OR_INT T4.X, PV.W, PS,
554 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
555 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
557 ; CI-LABEL: s_test_imin_sle_v4i8:
559 ; CI-NEXT: s_load_dword s2, s[6:7], 0xa
560 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
561 ; CI-NEXT: s_load_dword s3, s[6:7], 0x13
562 ; CI-NEXT: s_waitcnt lgkmcnt(0)
563 ; CI-NEXT: s_ashr_i32 s4, s2, 24
564 ; CI-NEXT: s_sext_i32_i8 s5, s2
565 ; CI-NEXT: s_bfe_i32 s6, s2, 0x80008
566 ; CI-NEXT: s_bfe_i32 s2, s2, 0x80010
567 ; CI-NEXT: s_ashr_i32 s7, s3, 24
568 ; CI-NEXT: s_sext_i32_i8 s8, s3
569 ; CI-NEXT: s_bfe_i32 s9, s3, 0x80008
570 ; CI-NEXT: s_bfe_i32 s3, s3, 0x80010
571 ; CI-NEXT: s_min_i32 s2, s2, s3
572 ; CI-NEXT: s_min_i32 s4, s4, s7
573 ; CI-NEXT: s_and_b32 s2, s2, 0xff
574 ; CI-NEXT: s_lshl_b32 s4, s4, 24
575 ; CI-NEXT: s_lshl_b32 s2, s2, 16
576 ; CI-NEXT: s_or_b32 s2, s4, s2
577 ; CI-NEXT: s_min_i32 s3, s6, s9
578 ; CI-NEXT: s_min_i32 s4, s5, s8
579 ; CI-NEXT: s_lshl_b32 s3, s3, 8
580 ; CI-NEXT: s_and_b32 s4, s4, 0xff
581 ; CI-NEXT: s_or_b32 s3, s4, s3
582 ; CI-NEXT: s_and_b32 s3, s3, 0xffff
583 ; CI-NEXT: s_or_b32 s2, s3, s2
584 ; CI-NEXT: v_mov_b32_e32 v0, s0
585 ; CI-NEXT: v_mov_b32_e32 v1, s1
586 ; CI-NEXT: v_mov_b32_e32 v2, s2
587 ; CI-NEXT: flat_store_dword v[0:1], v2
590 ; VI-LABEL: s_test_imin_sle_v4i8:
592 ; VI-NEXT: s_load_dword s2, s[6:7], 0x28
593 ; VI-NEXT: s_load_dword s3, s[6:7], 0x4c
594 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
595 ; VI-NEXT: s_waitcnt lgkmcnt(0)
596 ; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s2
597 ; VI-NEXT: v_lshrrev_b16_e64 v1, 8, s3
598 ; VI-NEXT: s_ashr_i32 s4, s2, 24
599 ; VI-NEXT: s_bfe_i32 s5, s2, 0x80010
600 ; VI-NEXT: s_sext_i32_i8 s2, s2
601 ; VI-NEXT: s_ashr_i32 s6, s3, 24
602 ; VI-NEXT: s_bfe_i32 s7, s3, 0x80010
603 ; VI-NEXT: s_sext_i32_i8 s3, s3
604 ; VI-NEXT: s_min_i32 s4, s4, s6
605 ; VI-NEXT: s_min_i32 s2, s2, s3
606 ; VI-NEXT: s_min_i32 s3, s5, s7
607 ; VI-NEXT: v_min_i32_sdwa v0, sext(v0), sext(v1) dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
608 ; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s4
609 ; VI-NEXT: v_mov_b32_e32 v2, s3
610 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
611 ; VI-NEXT: v_mov_b32_e32 v2, s2
612 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
613 ; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
614 ; VI-NEXT: v_mov_b32_e32 v0, s0
615 ; VI-NEXT: v_mov_b32_e32 v1, s1
616 ; VI-NEXT: flat_store_dword v[0:1], v2
619 ; GFX9-LABEL: s_test_imin_sle_v4i8:
621 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28
622 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
623 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c
624 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
625 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
626 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16
627 ; GFX9-NEXT: s_bfe_i32 s6, s4, 0x80000
628 ; GFX9-NEXT: s_lshr_b32 s7, s3, 16
629 ; GFX9-NEXT: s_bfe_i32 s8, s7, 0x80000
630 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
631 ; GFX9-NEXT: v_mov_b32_e32 v2, s8
632 ; GFX9-NEXT: s_bfe_i32 s9, s3, 0x80000
633 ; GFX9-NEXT: v_min_i16_sdwa v1, sext(s4), sext(v1) dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
634 ; GFX9-NEXT: v_min_i16_e32 v2, s6, v2
635 ; GFX9-NEXT: s_bfe_i32 s5, s2, 0x80000
636 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
637 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
638 ; GFX9-NEXT: v_mov_b32_e32 v3, s9
639 ; GFX9-NEXT: v_min_i16_sdwa v2, sext(s2), sext(v2) dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
640 ; GFX9-NEXT: v_min_i16_e32 v3, s5, v3
641 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
642 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
643 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
644 ; GFX9-NEXT: s_endpgm
646 ; GFX10-LABEL: s_test_imin_sle_v4i8:
648 ; GFX10-NEXT: s_clause 0x2
649 ; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28
650 ; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c
651 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
652 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
653 ; GFX10-NEXT: s_lshr_b32 s4, s2, 16
654 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16
655 ; GFX10-NEXT: v_ashrrev_i16 v0, 8, s2
656 ; GFX10-NEXT: v_ashrrev_i16 v1, 8, s4
657 ; GFX10-NEXT: v_ashrrev_i16 v2, 8, s5
658 ; GFX10-NEXT: v_ashrrev_i16 v3, 8, s3
659 ; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80000
660 ; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80000
661 ; GFX10-NEXT: s_bfe_i32 s4, s4, 0x80000
662 ; GFX10-NEXT: v_min_i16 v1, v1, v2
663 ; GFX10-NEXT: v_min_i16 v0, v0, v3
664 ; GFX10-NEXT: s_bfe_i32 s5, s5, 0x80000
665 ; GFX10-NEXT: v_min_i16 v2, s2, s3
666 ; GFX10-NEXT: v_min_i16 v3, s4, s5
667 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
668 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
669 ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
670 ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
671 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
672 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
673 ; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
674 ; GFX10-NEXT: s_endpgm
676 ; GFX11-LABEL: s_test_imin_sle_v4i8:
678 ; GFX11-NEXT: s_clause 0x1
679 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x28
680 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x4c
681 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
682 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16
683 ; GFX11-NEXT: s_lshr_b32 s5, s1, 16
684 ; GFX11-NEXT: v_ashrrev_i16 v0, 8, s0
685 ; GFX11-NEXT: v_ashrrev_i16 v1, 8, s1
686 ; GFX11-NEXT: v_ashrrev_i16 v2, 8, s4
687 ; GFX11-NEXT: v_ashrrev_i16 v3, 8, s5
688 ; GFX11-NEXT: s_bfe_i32 s0, s0, 0x80000
689 ; GFX11-NEXT: s_bfe_i32 s1, s1, 0x80000
690 ; GFX11-NEXT: s_bfe_i32 s4, s4, 0x80000
691 ; GFX11-NEXT: s_bfe_i32 s5, s5, 0x80000
692 ; GFX11-NEXT: v_min_i16 v4, s0, s1
693 ; GFX11-NEXT: v_min_i16 v5, s4, s5
694 ; GFX11-NEXT: v_min_i16 v2, v2, v3
695 ; GFX11-NEXT: v_min_i16 v0, v0, v1
696 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
697 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v4
698 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v5
699 ; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2
700 ; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0
701 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
702 ; GFX11-NEXT: v_or_b32_e32 v2, v3, v2
703 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
704 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
705 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
706 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
707 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
708 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
709 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
710 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
711 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
712 ; GFX11-NEXT: s_nop 0
713 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
714 ; GFX11-NEXT: s_endpgm
715 %cmp = icmp sle <4 x i8> %a, %b
716 %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b ; element-wise signed min
717 store <4 x i8> %val, ptr addrspace(1) %out
; Element-wise signed minimum of two <2 x i16> kernel arguments (icmp sle +
; select). The cypress, kaveri and tonga runs are expected to sign-extend
; each 16-bit half and use two 32-bit mins (MIN_INT / s_min_i32) before
; repacking the halves into one dword. The gfx900, gfx1010 and gfx1100 runs
; are expected to use a single v_pk_min_i16 on the packed value.
721 define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #0 {
722 ; EG-LABEL: s_test_imin_sle_v2i16:
724 ; EG-NEXT: ALU 0, @14, KC0[], KC1[]
726 ; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[]
727 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
730 ; EG-NEXT: Fetch clause starting at 6:
731 ; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
732 ; EG-NEXT: VTX_READ_16 T6.X, T4.X, 44, #3
733 ; EG-NEXT: VTX_READ_16 T7.X, T4.X, 40, #3
734 ; EG-NEXT: VTX_READ_16 T4.X, T4.X, 46, #3
735 ; EG-NEXT: ALU clause starting at 14:
736 ; EG-NEXT: MOV * T4.X, 0.0,
737 ; EG-NEXT: ALU clause starting at 15:
738 ; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
739 ; EG-NEXT: BFE_INT T0.Y, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
740 ; EG-NEXT: BFE_INT * T0.Z, T7.X, 0.0, literal.x, BS:VEC_201
741 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
742 ; EG-NEXT: BFE_INT * T0.W, T6.X, 0.0, literal.x,
743 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
744 ; EG-NEXT: MIN_INT T0.W, T0.Z, PV.W,
745 ; EG-NEXT: MIN_INT * T1.W, T5.X, T0.Y,
746 ; EG-NEXT: LSHL T1.W, PS, literal.x,
747 ; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
748 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
749 ; EG-NEXT: OR_INT T4.X, PV.W, PS,
750 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
751 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
753 ; CI-LABEL: s_test_imin_sle_v2i16:
755 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
756 ; CI-NEXT: s_waitcnt lgkmcnt(0)
757 ; CI-NEXT: s_ashr_i32 s4, s2, 16
758 ; CI-NEXT: s_sext_i32_i16 s2, s2
759 ; CI-NEXT: s_ashr_i32 s5, s3, 16
760 ; CI-NEXT: s_sext_i32_i16 s3, s3
761 ; CI-NEXT: s_min_i32 s4, s4, s5
762 ; CI-NEXT: s_min_i32 s2, s2, s3
763 ; CI-NEXT: s_lshl_b32 s3, s4, 16
764 ; CI-NEXT: s_and_b32 s2, s2, 0xffff
765 ; CI-NEXT: s_or_b32 s2, s2, s3
766 ; CI-NEXT: v_mov_b32_e32 v0, s0
767 ; CI-NEXT: v_mov_b32_e32 v1, s1
768 ; CI-NEXT: v_mov_b32_e32 v2, s2
769 ; CI-NEXT: flat_store_dword v[0:1], v2
772 ; VI-LABEL: s_test_imin_sle_v2i16:
774 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
775 ; VI-NEXT: s_waitcnt lgkmcnt(0)
776 ; VI-NEXT: s_ashr_i32 s4, s2, 16
777 ; VI-NEXT: s_sext_i32_i16 s2, s2
778 ; VI-NEXT: s_ashr_i32 s5, s3, 16
779 ; VI-NEXT: s_sext_i32_i16 s3, s3
780 ; VI-NEXT: s_min_i32 s4, s4, s5
781 ; VI-NEXT: s_min_i32 s2, s2, s3
782 ; VI-NEXT: s_lshl_b32 s3, s4, 16
783 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
784 ; VI-NEXT: s_or_b32 s2, s2, s3
785 ; VI-NEXT: v_mov_b32_e32 v0, s0
786 ; VI-NEXT: v_mov_b32_e32 v1, s1
787 ; VI-NEXT: v_mov_b32_e32 v2, s2
788 ; VI-NEXT: flat_store_dword v[0:1], v2
791 ; GFX9-LABEL: s_test_imin_sle_v2i16:
793 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
794 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
795 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
796 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
797 ; GFX9-NEXT: v_pk_min_i16 v1, s2, v1
798 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
799 ; GFX9-NEXT: s_endpgm
801 ; GFX10-LABEL: s_test_imin_sle_v2i16:
803 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
804 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
805 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
806 ; GFX10-NEXT: v_pk_min_i16 v1, s2, s3
807 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
808 ; GFX10-NEXT: s_endpgm
810 ; GFX11-LABEL: s_test_imin_sle_v2i16:
812 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
813 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
814 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
815 ; GFX11-NEXT: v_pk_min_i16 v1, s2, s3
816 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
817 ; GFX11-NEXT: s_nop 0
818 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
819 ; GFX11-NEXT: s_endpgm
820 %cmp = icmp sle <2 x i16> %a, %b
821 %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b ; element-wise signed min
822 store <2 x i16> %val, ptr addrspace(1) %out
; Signed min of two <4 x i16> kernel arguments, written as icmp sle + select,
; with the result stored to %out. Per the autogenerated checks below, the
; GFX9, GFX10 and GFX11 runs expect two v_pk_min_i16 (one per packed dword),
; the CI and VI runs sign-extend each 16-bit half (s_ashr_i32 /
; s_sext_i32_i16), take s_min_i32 and repack with shift/and/or, and the EG
; run extracts elements with BFE_INT and uses MIN_INT.
826 define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16> %a, <4 x i16> %b) #0 {
827 ; EG-LABEL: s_test_imin_sle_v4i16:
829 ; EG-NEXT: ALU 1, @28, KC0[], KC1[]
831 ; EG-NEXT: ALU 9, @30, KC0[], KC1[]
833 ; EG-NEXT: ALU 10, @40, KC0[], KC1[]
835 ; EG-NEXT: ALU 10, @51, KC0[], KC1[]
837 ; EG-NEXT: ALU 11, @62, KC0[CB0:0-32], KC1[]
838 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XY, T5.X, 1
841 ; EG-NEXT: Fetch clause starting at 12:
842 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3
843 ; EG-NEXT: VTX_READ_16 T7.X, T5.X, 58, #3
844 ; EG-NEXT: Fetch clause starting at 16:
845 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3
846 ; EG-NEXT: VTX_READ_16 T7.X, T5.X, 56, #3
847 ; EG-NEXT: Fetch clause starting at 20:
848 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3
849 ; EG-NEXT: VTX_READ_16 T7.X, T5.X, 54, #3
850 ; EG-NEXT: Fetch clause starting at 24:
851 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
852 ; EG-NEXT: VTX_READ_16 T5.X, T5.X, 52, #3
853 ; EG-NEXT: ALU clause starting at 28:
854 ; EG-NEXT: MOV * T0.Y, T3.X,
855 ; EG-NEXT: MOV * T5.X, 0.0,
856 ; EG-NEXT: ALU clause starting at 30:
857 ; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
858 ; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
859 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
860 ; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
861 ; EG-NEXT: LSHL T0.W, PV.W, literal.x,
862 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
863 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
864 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
865 ; EG-NEXT: MOV * T3.X, PV.W,
866 ; EG-NEXT: MOV * T0.Y, PV.X,
867 ; EG-NEXT: ALU clause starting at 40:
868 ; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
869 ; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
870 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
871 ; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W,
872 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
873 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
874 ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
875 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
876 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
877 ; EG-NEXT: MOV T3.X, PV.W,
878 ; EG-NEXT: MOV * T0.Y, T2.X,
879 ; EG-NEXT: ALU clause starting at 51:
880 ; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
881 ; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
882 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
883 ; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W,
884 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
885 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
886 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
887 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
888 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
889 ; EG-NEXT: MOV * T2.X, PV.W,
890 ; EG-NEXT: MOV * T0.Y, PV.X,
891 ; EG-NEXT: ALU clause starting at 62:
892 ; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
893 ; EG-NEXT: BFE_INT * T0.W, T5.X, 0.0, literal.x, BS:VEC_120/SCL_212
894 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
895 ; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
896 ; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
897 ; EG-NEXT: AND_INT T1.W, T0.Y, literal.y,
898 ; EG-NEXT: AND_INT * T0.W, PV.W, literal.z,
899 ; EG-NEXT: 2(2.802597e-45), -65536(nan)
900 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
901 ; EG-NEXT: OR_INT * T6.X, PV.W, PS,
902 ; EG-NEXT: MOV T2.X, PV.X,
903 ; EG-NEXT: MOV * T6.Y, T3.X,
905 ; CI-LABEL: s_test_imin_sle_v4i16:
907 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2
908 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
909 ; CI-NEXT: s_waitcnt lgkmcnt(0)
910 ; CI-NEXT: s_ashr_i32 s6, s0, 16
911 ; CI-NEXT: s_ashr_i32 s7, s1, 16
912 ; CI-NEXT: s_sext_i32_i16 s0, s0
913 ; CI-NEXT: s_sext_i32_i16 s1, s1
914 ; CI-NEXT: s_ashr_i32 s8, s2, 16
915 ; CI-NEXT: s_ashr_i32 s9, s3, 16
916 ; CI-NEXT: s_sext_i32_i16 s2, s2
917 ; CI-NEXT: s_sext_i32_i16 s3, s3
918 ; CI-NEXT: s_min_i32 s7, s7, s9
919 ; CI-NEXT: s_min_i32 s1, s1, s3
920 ; CI-NEXT: s_min_i32 s3, s6, s8
921 ; CI-NEXT: s_min_i32 s0, s0, s2
922 ; CI-NEXT: s_lshl_b32 s7, s7, 16
923 ; CI-NEXT: s_and_b32 s1, s1, 0xffff
924 ; CI-NEXT: s_lshl_b32 s3, s3, 16
925 ; CI-NEXT: s_and_b32 s0, s0, 0xffff
926 ; CI-NEXT: s_or_b32 s1, s1, s7
927 ; CI-NEXT: s_or_b32 s0, s0, s3
928 ; CI-NEXT: v_mov_b32_e32 v2, s4
929 ; CI-NEXT: v_mov_b32_e32 v0, s0
930 ; CI-NEXT: v_mov_b32_e32 v1, s1
931 ; CI-NEXT: v_mov_b32_e32 v3, s5
932 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
935 ; VI-LABEL: s_test_imin_sle_v4i16:
937 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8
938 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
939 ; VI-NEXT: s_waitcnt lgkmcnt(0)
940 ; VI-NEXT: s_ashr_i32 s6, s1, 16
941 ; VI-NEXT: s_sext_i32_i16 s1, s1
942 ; VI-NEXT: s_ashr_i32 s8, s3, 16
943 ; VI-NEXT: s_sext_i32_i16 s3, s3
944 ; VI-NEXT: s_ashr_i32 s7, s0, 16
945 ; VI-NEXT: s_sext_i32_i16 s0, s0
946 ; VI-NEXT: s_ashr_i32 s9, s2, 16
947 ; VI-NEXT: s_sext_i32_i16 s2, s2
948 ; VI-NEXT: s_min_i32 s6, s6, s8
949 ; VI-NEXT: s_min_i32 s1, s1, s3
950 ; VI-NEXT: s_min_i32 s7, s7, s9
951 ; VI-NEXT: s_min_i32 s0, s0, s2
952 ; VI-NEXT: s_lshl_b32 s2, s6, 16
953 ; VI-NEXT: s_and_b32 s1, s1, 0xffff
954 ; VI-NEXT: s_or_b32 s1, s1, s2
955 ; VI-NEXT: s_lshl_b32 s2, s7, 16
956 ; VI-NEXT: s_and_b32 s0, s0, 0xffff
957 ; VI-NEXT: s_or_b32 s0, s0, s2
958 ; VI-NEXT: v_mov_b32_e32 v2, s4
959 ; VI-NEXT: v_mov_b32_e32 v0, s0
960 ; VI-NEXT: v_mov_b32_e32 v1, s1
961 ; VI-NEXT: v_mov_b32_e32 v3, s5
962 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
965 ; GFX9-LABEL: s_test_imin_sle_v4i16:
967 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8
968 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
969 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
970 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
971 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
972 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
973 ; GFX9-NEXT: v_pk_min_i16 v1, s1, v0
974 ; GFX9-NEXT: v_pk_min_i16 v0, s0, v3
975 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
976 ; GFX9-NEXT: s_endpgm
978 ; GFX10-LABEL: s_test_imin_sle_v4i16:
980 ; GFX10-NEXT: s_clause 0x1
981 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8
982 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
983 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
984 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
985 ; GFX10-NEXT: v_pk_min_i16 v1, s1, s3
986 ; GFX10-NEXT: v_pk_min_i16 v0, s0, s2
987 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
988 ; GFX10-NEXT: s_endpgm
990 ; GFX11-LABEL: s_test_imin_sle_v4i16:
992 ; GFX11-NEXT: s_clause 0x1
993 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8
994 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
995 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
996 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
997 ; GFX11-NEXT: v_pk_min_i16 v1, s5, s7
998 ; GFX11-NEXT: v_pk_min_i16 v0, s4, s6
999 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1000 ; GFX11-NEXT: s_nop 0
1001 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1002 ; GFX11-NEXT: s_endpgm
1003 %cmp = icmp sle <4 x i16> %a, %b
1004 %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
1005 store <4 x i16> %val, ptr addrspace(1) %out
; Per-workitem signed i32 min: loads one i32 from %aptr and one from %bptr at
; index llvm.amdgcn.workitem.id.x, combines them with icmp slt + select, and
; stores the result to %out at the same index. The autogenerated checks
; expect a single MIN_INT for the EG run and a single v_min_i32 for each of
; the amdgcn runs.
1009 define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
1010 ; EG-LABEL: v_test_imin_slt_i32:
1012 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
1014 ; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
1015 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1018 ; EG-NEXT: Fetch clause starting at 6:
1019 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
1020 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1021 ; EG-NEXT: ALU clause starting at 10:
1022 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1023 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1024 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
1025 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
1026 ; EG-NEXT: ALU clause starting at 14:
1027 ; EG-NEXT: MIN_INT T0.X, T0.X, T1.X,
1028 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1029 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
1030 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1032 ; CI-LABEL: v_test_imin_slt_i32:
1034 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1035 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
1036 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1037 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1038 ; CI-NEXT: v_mov_b32_e32 v1, s3
1039 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
1040 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1041 ; CI-NEXT: v_mov_b32_e32 v3, s5
1042 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
1043 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1044 ; CI-NEXT: flat_load_dword v5, v[0:1]
1045 ; CI-NEXT: flat_load_dword v2, v[2:3]
1046 ; CI-NEXT: v_mov_b32_e32 v1, s1
1047 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
1048 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1049 ; CI-NEXT: s_waitcnt vmcnt(0)
1050 ; CI-NEXT: v_min_i32_e32 v2, v5, v2
1051 ; CI-NEXT: flat_store_dword v[0:1], v2
1054 ; VI-LABEL: v_test_imin_slt_i32:
1056 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1057 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1058 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1059 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1060 ; VI-NEXT: v_mov_b32_e32 v1, s3
1061 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1062 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1063 ; VI-NEXT: v_mov_b32_e32 v3, s5
1064 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1065 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1066 ; VI-NEXT: flat_load_dword v5, v[0:1]
1067 ; VI-NEXT: flat_load_dword v2, v[2:3]
1068 ; VI-NEXT: v_mov_b32_e32 v1, s1
1069 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
1070 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1071 ; VI-NEXT: s_waitcnt vmcnt(0)
1072 ; VI-NEXT: v_min_i32_e32 v2, v5, v2
1073 ; VI-NEXT: flat_store_dword v[0:1], v2
1076 ; GFX9-LABEL: v_test_imin_slt_i32:
1078 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1079 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1080 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1081 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1082 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1083 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
1084 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1085 ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2
1086 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1087 ; GFX9-NEXT: s_endpgm
1089 ; GFX10-LABEL: v_test_imin_slt_i32:
1091 ; GFX10-NEXT: s_clause 0x1
1092 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1093 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1094 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1095 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1096 ; GFX10-NEXT: s_clause 0x1
1097 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1098 ; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
1099 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1100 ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2
1101 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1102 ; GFX10-NEXT: s_endpgm
1104 ; GFX11-LABEL: v_test_imin_slt_i32:
1106 ; GFX11-NEXT: s_clause 0x1
1107 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
1108 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
1109 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1110 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1111 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1112 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1113 ; GFX11-NEXT: s_clause 0x1
1114 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
1115 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1]
1116 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1117 ; GFX11-NEXT: v_min_i32_e32 v1, v1, v2
1118 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
1119 ; GFX11-NEXT: s_nop 0
1120 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1121 ; GFX11-NEXT: s_endpgm
1122 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1123 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
1124 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
1125 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
1126 %a = load i32, ptr addrspace(1) %a.gep, align 4
1127 %b = load i32, ptr addrspace(1) %b.gep, align 4
1128 %cmp = icmp slt i32 %a, %b
1129 %val = select i1 %cmp, i32 %a, i32 %b
1130 store i32 %val, ptr addrspace(1) %out.gep, align 4
; Per-workitem signed i16 min: loads one i16 from %aptr and one from %bptr at
; index llvm.amdgcn.workitem.id.x, combines them with icmp slt + select, and
; stores the i16 result to %out at the same index. Per the autogenerated
; checks, the CI run loads with flat_load_sshort and uses v_min_i32, the VI,
; GFX9, GFX10 and GFX11 runs load unsigned 16-bit values and use v_min_i16,
; and the EG run sign-extends with BFE_INT, uses MIN_INT, and stores through
; MEM_RAT MSKOR.
1134 define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
1135 ; EG-LABEL: v_test_imin_slt_i16:
1137 ; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[]
1139 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
1140 ; EG-NEXT: TEX 0 @10
1141 ; EG-NEXT: ALU 16, @15, KC0[CB0:0-32], KC1[]
1142 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X
1145 ; EG-NEXT: Fetch clause starting at 8:
1146 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1147 ; EG-NEXT: Fetch clause starting at 10:
1148 ; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
1149 ; EG-NEXT: ALU clause starting at 12:
1150 ; EG-NEXT: LSHL * T0.W, T0.X, 1,
1151 ; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W,
1152 ; EG-NEXT: ALU clause starting at 14:
1153 ; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, T0.W,
1154 ; EG-NEXT: ALU clause starting at 15:
1155 ; EG-NEXT: BFE_INT T0.Z, T0.X, 0.0, literal.x,
1156 ; EG-NEXT: BFE_INT T1.W, T1.X, 0.0, literal.x, BS:VEC_120/SCL_212
1157 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1158 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1159 ; EG-NEXT: AND_INT T2.W, PS, literal.x,
1160 ; EG-NEXT: MIN_INT * T1.W, PV.W, PV.Z,
1161 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1162 ; EG-NEXT: AND_INT T1.W, PS, literal.x,
1163 ; EG-NEXT: LSHL * T2.W, PV.W, literal.y,
1164 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1165 ; EG-NEXT: LSHL T1.X, PV.W, PS,
1166 ; EG-NEXT: LSHL * T1.W, literal.x, PS,
1167 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1168 ; EG-NEXT: MOV T1.Y, 0.0,
1169 ; EG-NEXT: MOV * T1.Z, 0.0,
1170 ; EG-NEXT: LSHR * T0.X, T0.W, literal.x,
1171 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1173 ; CI-LABEL: v_test_imin_slt_i16:
1175 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1176 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
1177 ; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
1178 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1179 ; CI-NEXT: v_mov_b32_e32 v1, s3
1180 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
1181 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1182 ; CI-NEXT: v_mov_b32_e32 v3, s5
1183 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
1184 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1185 ; CI-NEXT: flat_load_sshort v5, v[0:1]
1186 ; CI-NEXT: flat_load_sshort v2, v[2:3]
1187 ; CI-NEXT: v_mov_b32_e32 v1, s1
1188 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
1189 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1190 ; CI-NEXT: s_waitcnt vmcnt(0)
1191 ; CI-NEXT: v_min_i32_e32 v2, v5, v2
1192 ; CI-NEXT: flat_store_short v[0:1], v2
1195 ; VI-LABEL: v_test_imin_slt_i16:
1197 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1198 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1199 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
1200 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1201 ; VI-NEXT: v_mov_b32_e32 v1, s3
1202 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1203 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1204 ; VI-NEXT: v_mov_b32_e32 v3, s5
1205 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1206 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1207 ; VI-NEXT: flat_load_ushort v5, v[0:1]
1208 ; VI-NEXT: flat_load_ushort v2, v[2:3]
1209 ; VI-NEXT: v_mov_b32_e32 v1, s1
1210 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
1211 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1212 ; VI-NEXT: s_waitcnt vmcnt(0)
1213 ; VI-NEXT: v_min_i16_e32 v2, v5, v2
1214 ; VI-NEXT: flat_store_short v[0:1], v2
1217 ; GFX9-LABEL: v_test_imin_slt_i16:
1219 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1220 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1221 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1222 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1223 ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
1224 ; GFX9-NEXT: global_load_ushort v2, v0, s[4:5]
1225 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1226 ; GFX9-NEXT: v_min_i16_e32 v1, v1, v2
1227 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
1228 ; GFX9-NEXT: s_endpgm
1230 ; GFX10-LABEL: v_test_imin_slt_i16:
1232 ; GFX10-NEXT: s_clause 0x1
1233 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1234 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1235 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1236 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1237 ; GFX10-NEXT: s_clause 0x1
1238 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
1239 ; GFX10-NEXT: global_load_ushort v2, v0, s[4:5]
1240 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1241 ; GFX10-NEXT: v_min_i16 v1, v1, v2
1242 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
1243 ; GFX10-NEXT: s_endpgm
1245 ; GFX11-LABEL: v_test_imin_slt_i16:
1247 ; GFX11-NEXT: s_clause 0x1
1248 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
1249 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
1250 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1251 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1252 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1253 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1254 ; GFX11-NEXT: s_clause 0x1
1255 ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
1256 ; GFX11-NEXT: global_load_u16 v2, v0, s[0:1]
1257 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1258 ; GFX11-NEXT: v_min_i16 v1, v1, v2
1259 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
1260 ; GFX11-NEXT: s_nop 0
1261 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1262 ; GFX11-NEXT: s_endpgm
1263 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1264 %a.gep = getelementptr inbounds i16, ptr addrspace(1) %aptr, i32 %tid
1265 %b.gep = getelementptr inbounds i16, ptr addrspace(1) %bptr, i32 %tid
1266 %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
1268 %a = load i16, ptr addrspace(1) %a.gep
1269 %b = load i16, ptr addrspace(1) %b.gep
1270 %cmp = icmp slt i16 %a, %b
1271 %val = select i1 %cmp, i16 %a, i16 %b
1272 store i16 %val, ptr addrspace(1) %out.gep
; Signed min of two i32 kernel arguments, written as icmp slt + select, with
; the result stored to %out. The autogenerated checks expect a single
; s_min_i32 on the loaded argument registers for each amdgcn run, and a
; single MIN_INT reading both operands from KC0 for the EG run.
1276 define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
1277 ; EG-LABEL: s_test_imin_slt_i32:
1279 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1280 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1283 ; EG-NEXT: ALU clause starting at 4:
1284 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
1285 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
1286 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1288 ; CI-LABEL: s_test_imin_slt_i32:
1290 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1291 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1292 ; CI-NEXT: s_min_i32 s2, s2, s3
1293 ; CI-NEXT: v_mov_b32_e32 v0, s0
1294 ; CI-NEXT: v_mov_b32_e32 v1, s1
1295 ; CI-NEXT: v_mov_b32_e32 v2, s2
1296 ; CI-NEXT: flat_store_dword v[0:1], v2
1299 ; VI-LABEL: s_test_imin_slt_i32:
1301 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1302 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1303 ; VI-NEXT: s_min_i32 s2, s2, s3
1304 ; VI-NEXT: v_mov_b32_e32 v0, s0
1305 ; VI-NEXT: v_mov_b32_e32 v1, s1
1306 ; VI-NEXT: v_mov_b32_e32 v2, s2
1307 ; VI-NEXT: flat_store_dword v[0:1], v2
1310 ; GFX9-LABEL: s_test_imin_slt_i32:
1312 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1313 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1314 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1315 ; GFX9-NEXT: s_min_i32 s2, s2, s3
1316 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1317 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1318 ; GFX9-NEXT: s_endpgm
1320 ; GFX10-LABEL: s_test_imin_slt_i32:
1322 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1323 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1324 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1325 ; GFX10-NEXT: s_min_i32 s2, s2, s3
1326 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
1327 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1328 ; GFX10-NEXT: s_endpgm
1330 ; GFX11-LABEL: s_test_imin_slt_i32:
1332 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
1333 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1334 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1335 ; GFX11-NEXT: s_min_i32 s2, s2, s3
1336 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1337 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
1338 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1339 ; GFX11-NEXT: s_nop 0
1340 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1341 ; GFX11-NEXT: s_endpgm
1342 %cmp = icmp slt i32 %a, %b
1343 %val = select i1 %cmp, i32 %a, i32 %b
1344 store i32 %val, ptr addrspace(1) %out, align 4
; Element-wise signed min of two <2 x i32> kernel arguments, written as
; icmp slt + select, with the result stored to %out. The autogenerated
; checks expect one s_min_i32 per element (two in total) for each amdgcn run
; and two MIN_INT for the EG run.
1348 define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
1349 ; EG-LABEL: s_test_imin_slt_v2i32:
1351 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
1352 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1355 ; EG-NEXT: ALU clause starting at 4:
1356 ; EG-NEXT: MIN_INT * T0.Y, KC0[3].X, KC0[3].Z,
1357 ; EG-NEXT: MIN_INT * T0.X, KC0[2].W, KC0[3].Y,
1358 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1359 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1361 ; CI-LABEL: s_test_imin_slt_v2i32:
1363 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2
1364 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1365 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1366 ; CI-NEXT: s_min_i32 s1, s1, s3
1367 ; CI-NEXT: s_min_i32 s0, s0, s2
1368 ; CI-NEXT: v_mov_b32_e32 v2, s4
1369 ; CI-NEXT: v_mov_b32_e32 v0, s0
1370 ; CI-NEXT: v_mov_b32_e32 v1, s1
1371 ; CI-NEXT: v_mov_b32_e32 v3, s5
1372 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1375 ; VI-LABEL: s_test_imin_slt_v2i32:
1377 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8
1378 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1379 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1380 ; VI-NEXT: s_min_i32 s1, s1, s3
1381 ; VI-NEXT: s_min_i32 s0, s0, s2
1382 ; VI-NEXT: v_mov_b32_e32 v2, s4
1383 ; VI-NEXT: v_mov_b32_e32 v0, s0
1384 ; VI-NEXT: v_mov_b32_e32 v1, s1
1385 ; VI-NEXT: v_mov_b32_e32 v3, s5
1386 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1389 ; GFX9-LABEL: s_test_imin_slt_v2i32:
1391 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8
1392 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1393 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1394 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1395 ; GFX9-NEXT: s_min_i32 s1, s1, s3
1396 ; GFX9-NEXT: s_min_i32 s0, s0, s2
1397 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1398 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1399 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
1400 ; GFX9-NEXT: s_endpgm
1402 ; GFX10-LABEL: s_test_imin_slt_v2i32:
1404 ; GFX10-NEXT: s_clause 0x1
1405 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8
1406 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1407 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1408 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1409 ; GFX10-NEXT: s_min_i32 s0, s0, s2
1410 ; GFX10-NEXT: s_min_i32 s1, s1, s3
1411 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1412 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1413 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
1414 ; GFX10-NEXT: s_endpgm
1416 ; GFX11-LABEL: s_test_imin_slt_v2i32:
1418 ; GFX11-NEXT: s_clause 0x1
1419 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8
1420 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
1421 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1422 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1423 ; GFX11-NEXT: s_min_i32 s2, s4, s6
1424 ; GFX11-NEXT: s_min_i32 s3, s5, s7
1425 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
1426 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
1427 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1428 ; GFX11-NEXT: s_nop 0
1429 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1430 ; GFX11-NEXT: s_endpgm
1431 %cmp = icmp slt <2 x i32> %a, %b
1432 %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
1433 store <2 x i32> %val, ptr addrspace(1) %out
; Signed min of an i32 kernel argument and the constant 8, written as
; icmp slt %a, 8 followed by a select between %a and 8, with the result
; stored to %out. The autogenerated checks expect s_min_i32 with 8 as a
; direct operand for each amdgcn run, and MIN_INT with 8 supplied as a
; literal for the EG run.
1437 define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a) #0 {
1438 ; EG-LABEL: s_test_imin_slt_imm_i32:
1440 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1441 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1444 ; EG-NEXT: ALU clause starting at 4:
1445 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
1446 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, literal.y,
1447 ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
1449 ; CI-LABEL: s_test_imin_slt_imm_i32:
1451 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2
1452 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
1453 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1454 ; CI-NEXT: s_min_i32 s2, s2, 8
1455 ; CI-NEXT: v_mov_b32_e32 v0, s0
1456 ; CI-NEXT: v_mov_b32_e32 v1, s1
1457 ; CI-NEXT: v_mov_b32_e32 v2, s2
1458 ; CI-NEXT: flat_store_dword v[0:1], v2
1461 ; VI-LABEL: s_test_imin_slt_imm_i32:
1463 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
1464 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
1465 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1466 ; VI-NEXT: s_min_i32 s2, s2, 8
1467 ; VI-NEXT: v_mov_b32_e32 v0, s0
1468 ; VI-NEXT: v_mov_b32_e32 v1, s1
1469 ; VI-NEXT: v_mov_b32_e32 v2, s2
1470 ; VI-NEXT: flat_store_dword v[0:1], v2
1473 ; GFX9-LABEL: s_test_imin_slt_imm_i32:
1475 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8
1476 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
1477 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1478 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1479 ; GFX9-NEXT: s_min_i32 s2, s2, 8
1480 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1481 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1482 ; GFX9-NEXT: s_endpgm
1484 ; GFX10-LABEL: s_test_imin_slt_imm_i32:
1486 ; GFX10-NEXT: s_clause 0x1
1487 ; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8
1488 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
1489 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1490 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1491 ; GFX10-NEXT: s_min_i32 s2, s2, 8
1492 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
1493 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1494 ; GFX10-NEXT: s_endpgm
1496 ; GFX11-LABEL: s_test_imin_slt_imm_i32:
1498 ; GFX11-NEXT: s_clause 0x1
1499 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
1500 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
1501 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1502 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1503 ; GFX11-NEXT: s_min_i32 s2, s4, 8
1504 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1505 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
1506 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1507 ; GFX11-NEXT: s_nop 0
1508 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1509 ; GFX11-NEXT: s_endpgm
1510 %cmp = icmp slt i32 %a, 8
1511 %val = select i1 %cmp, i32 %a, i32 8
1512 store i32 %val, ptr addrspace(1) %out, align 4
; Signed min of an i32 kernel argument and the constant 8, written with the
; non-strict predicate: icmp sle %a, 8 followed by a select between %a and 8,
; with the result stored to %out. The autogenerated checks still expect the
; constant 8 (not 9) in the min: s_min_i32 ..., 8 for each amdgcn run and
; MIN_INT with literal 8 for the EG run.
1516 define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a) #0 {
1517 ; EG-LABEL: s_test_imin_sle_imm_i32:
1519 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1520 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1523 ; EG-NEXT: ALU clause starting at 4:
1524 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
1525 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, literal.y,
1526 ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
1528 ; CI-LABEL: s_test_imin_sle_imm_i32:
1530 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2
1531 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
1532 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1533 ; CI-NEXT: s_min_i32 s2, s2, 8
1534 ; CI-NEXT: v_mov_b32_e32 v0, s0
1535 ; CI-NEXT: v_mov_b32_e32 v1, s1
1536 ; CI-NEXT: v_mov_b32_e32 v2, s2
1537 ; CI-NEXT: flat_store_dword v[0:1], v2
1540 ; VI-LABEL: s_test_imin_sle_imm_i32:
1542 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
1543 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
1544 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1545 ; VI-NEXT: s_min_i32 s2, s2, 8
1546 ; VI-NEXT: v_mov_b32_e32 v0, s0
1547 ; VI-NEXT: v_mov_b32_e32 v1, s1
1548 ; VI-NEXT: v_mov_b32_e32 v2, s2
1549 ; VI-NEXT: flat_store_dword v[0:1], v2
1552 ; GFX9-LABEL: s_test_imin_sle_imm_i32:
1554 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8
1555 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
1556 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1557 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1558 ; GFX9-NEXT: s_min_i32 s2, s2, 8
1559 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1560 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1561 ; GFX9-NEXT: s_endpgm
1563 ; GFX10-LABEL: s_test_imin_sle_imm_i32:
1565 ; GFX10-NEXT: s_clause 0x1
1566 ; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8
1567 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
1568 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1569 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1570 ; GFX10-NEXT: s_min_i32 s2, s2, 8
1571 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
1572 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1573 ; GFX10-NEXT: s_endpgm
1575 ; GFX11-LABEL: s_test_imin_sle_imm_i32:
1577 ; GFX11-NEXT: s_clause 0x1
1578 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
1579 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
1580 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1581 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1582 ; GFX11-NEXT: s_min_i32 s2, s4, 8
1583 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1584 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
1585 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1586 ; GFX11-NEXT: s_nop 0
1587 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1588 ; GFX11-NEXT: s_endpgm
1589 %cmp = icmp sle i32 %a, 8
1590 %val = select i1 %cmp, i32 %a, i32 8
1591 store i32 %val, ptr addrspace(1) %out, align 4
; Per-workitem unsigned i32 min: loads one i32 from %a.ptr and one from
; %b.ptr at index llvm.amdgcn.workitem.id.x, combines them with
; icmp ule + select, and stores the result to %out at the same index. The
; autogenerated checks expect a single MIN_UINT for the EG run and a single
; v_min_u32 for each of the amdgcn runs.
1595 define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1596 ; EG-LABEL: v_test_umin_ule_i32:
1598 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
1600 ; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
1601 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1604 ; EG-NEXT: Fetch clause starting at 6:
1605 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
1606 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1607 ; EG-NEXT: ALU clause starting at 10:
1608 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1609 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1610 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
1611 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
1612 ; EG-NEXT: ALU clause starting at 14:
1613 ; EG-NEXT: MIN_UINT T0.X, T0.X, T1.X,
1614 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1615 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
1616 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1618 ; CI-LABEL: v_test_umin_ule_i32:
1620 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1621 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
1622 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1623 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1624 ; CI-NEXT: v_mov_b32_e32 v1, s3
1625 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
1626 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1627 ; CI-NEXT: v_mov_b32_e32 v3, s5
1628 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
1629 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1630 ; CI-NEXT: flat_load_dword v5, v[0:1]
1631 ; CI-NEXT: flat_load_dword v2, v[2:3]
1632 ; CI-NEXT: v_mov_b32_e32 v1, s1
1633 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
1634 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1635 ; CI-NEXT: s_waitcnt vmcnt(0)
1636 ; CI-NEXT: v_min_u32_e32 v2, v5, v2
1637 ; CI-NEXT: flat_store_dword v[0:1], v2
1640 ; VI-LABEL: v_test_umin_ule_i32:
1642 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1643 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1644 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1645 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1646 ; VI-NEXT: v_mov_b32_e32 v1, s3
1647 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1648 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1649 ; VI-NEXT: v_mov_b32_e32 v3, s5
1650 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1651 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1652 ; VI-NEXT: flat_load_dword v5, v[0:1]
1653 ; VI-NEXT: flat_load_dword v2, v[2:3]
1654 ; VI-NEXT: v_mov_b32_e32 v1, s1
1655 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
1656 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1657 ; VI-NEXT: s_waitcnt vmcnt(0)
1658 ; VI-NEXT: v_min_u32_e32 v2, v5, v2
1659 ; VI-NEXT: flat_store_dword v[0:1], v2
1662 ; GFX9-LABEL: v_test_umin_ule_i32:
1664 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1665 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1666 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1667 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1668 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1669 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
1670 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1671 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2
1672 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1673 ; GFX9-NEXT: s_endpgm
1675 ; GFX10-LABEL: v_test_umin_ule_i32:
1677 ; GFX10-NEXT: s_clause 0x1
1678 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1679 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1680 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1681 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1682 ; GFX10-NEXT: s_clause 0x1
1683 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1684 ; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
1685 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1686 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2
1687 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1688 ; GFX10-NEXT: s_endpgm
1690 ; GFX11-LABEL: v_test_umin_ule_i32:
1692 ; GFX11-NEXT: s_clause 0x1
1693 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
1694 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
1695 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1696 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1697 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1698 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1699 ; GFX11-NEXT: s_clause 0x1
1700 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
1701 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1]
1702 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1703 ; GFX11-NEXT: v_min_u32_e32 v1, v1, v2
1704 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
1705 ; GFX11-NEXT: s_nop 0
1706 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1707 ; GFX11-NEXT: s_endpgm
1708 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1709 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
1710 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
1711 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
1712 %a = load i32, ptr addrspace(1) %a.gep, align 4
1713 %b = load i32, ptr addrspace(1) %b.gep, align 4
1714 %cmp = icmp ule i32 %a, %b
1715 %val = select i1 %cmp, i32 %a, i32 %b
1716 store i32 %val, ptr addrspace(1) %out.gep, align 4
; Vector <3 x i32> unsigned minimum expressed as icmp ule + select.
; Each workitem uses workitem.id.x to index %a.ptr, %b.ptr and %out.
; Every run line is expected to produce one unsigned-min instruction per
; element (MIN_UINT in the r600 checks, v_min_u32 in the amdgcn checks); the
; amdgcn checks also expect a single three-dword load per operand and a single
; three-dword store.
1720 define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1721 ; EG-LABEL: v_test_umin_ule_v3i32:
1723 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
1725 ; EG-NEXT: ALU 9, @14, KC0[CB0:0-32], KC1[]
1726 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
1727 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1729 ; EG-NEXT: Fetch clause starting at 6:
1730 ; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1
1731 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1
1732 ; EG-NEXT: ALU clause starting at 10:
1733 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1734 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1735 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
1736 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
1737 ; EG-NEXT: ALU clause starting at 14:
1738 ; EG-NEXT: MIN_UINT * T0.Y, T2.Y, T1.Y,
1739 ; EG-NEXT: MIN_UINT T0.X, T2.X, T1.X,
1740 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1741 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
1742 ; EG-NEXT: MIN_UINT * T2.X, T2.Z, T1.Z,
1743 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1744 ; EG-NEXT: ADD_INT * T0.W, T0.W, literal.x,
1745 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1746 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
1747 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1749 ; CI-LABEL: v_test_umin_ule_v3i32:
1751 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1752 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
1753 ; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1754 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1755 ; CI-NEXT: v_mov_b32_e32 v1, s3
1756 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v6
1757 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1758 ; CI-NEXT: v_mov_b32_e32 v2, s5
1759 ; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v6
1760 ; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
1761 ; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
1762 ; CI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
1763 ; CI-NEXT: v_mov_b32_e32 v7, s1
1764 ; CI-NEXT: v_add_i32_e32 v6, vcc, s0, v6
1765 ; CI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
1766 ; CI-NEXT: s_waitcnt vmcnt(0)
1767 ; CI-NEXT: v_min_u32_e32 v2, v2, v5
1768 ; CI-NEXT: v_min_u32_e32 v1, v1, v4
1769 ; CI-NEXT: v_min_u32_e32 v0, v0, v3
1770 ; CI-NEXT: flat_store_dwordx3 v[6:7], v[0:2]
1773 ; VI-LABEL: v_test_umin_ule_v3i32:
1775 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1776 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1777 ; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1778 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1779 ; VI-NEXT: v_mov_b32_e32 v1, s3
1780 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
1781 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1782 ; VI-NEXT: v_mov_b32_e32 v2, s5
1783 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v6
1784 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
1785 ; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
1786 ; VI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
1787 ; VI-NEXT: v_mov_b32_e32 v7, s1
1788 ; VI-NEXT: v_add_u32_e32 v6, vcc, s0, v6
1789 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
1790 ; VI-NEXT: s_waitcnt vmcnt(0)
1791 ; VI-NEXT: v_min_u32_e32 v2, v2, v5
1792 ; VI-NEXT: v_min_u32_e32 v1, v1, v4
1793 ; VI-NEXT: v_min_u32_e32 v0, v0, v3
1794 ; VI-NEXT: flat_store_dwordx3 v[6:7], v[0:2]
1797 ; GFX9-LABEL: v_test_umin_ule_v3i32:
1799 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1800 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1801 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1802 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1803 ; GFX9-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3]
1804 ; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5]
1805 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1806 ; GFX9-NEXT: v_min_u32_e32 v2, v2, v5
1807 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v4
1808 ; GFX9-NEXT: v_min_u32_e32 v0, v0, v3
1809 ; GFX9-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
1810 ; GFX9-NEXT: s_endpgm
1812 ; GFX10-LABEL: v_test_umin_ule_v3i32:
1814 ; GFX10-NEXT: s_clause 0x1
1815 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1816 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1817 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1818 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1819 ; GFX10-NEXT: s_clause 0x1
1820 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3]
1821 ; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5]
1822 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1823 ; GFX10-NEXT: v_min_u32_e32 v2, v2, v5
1824 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v4
1825 ; GFX10-NEXT: v_min_u32_e32 v0, v0, v3
1826 ; GFX10-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
1827 ; GFX10-NEXT: s_endpgm
1829 ; GFX11-LABEL: v_test_umin_ule_v3i32:
1831 ; GFX11-NEXT: s_clause 0x1
1832 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
1833 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
1834 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1835 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1836 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1837 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1838 ; GFX11-NEXT: s_clause 0x1
1839 ; GFX11-NEXT: global_load_b96 v[0:2], v6, s[6:7]
1840 ; GFX11-NEXT: global_load_b96 v[3:5], v6, s[0:1]
1841 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1842 ; GFX11-NEXT: v_min_u32_e32 v2, v2, v5
1843 ; GFX11-NEXT: v_min_u32_e32 v1, v1, v4
1844 ; GFX11-NEXT: v_min_u32_e32 v0, v0, v3
1845 ; GFX11-NEXT: global_store_b96 v6, v[0:2], s[4:5]
1846 ; GFX11-NEXT: s_nop 0
1847 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1848 ; GFX11-NEXT: s_endpgm
1849 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1850 %a.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %a.ptr, i32 %tid
1851 %b.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %b.ptr, i32 %tid
1852 %out.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %out, i32 %tid
1854 %a = load <3 x i32>, ptr addrspace(1) %a.gep
1855 %b = load <3 x i32>, ptr addrspace(1) %b.gep
; Pattern under test: an unsigned <= compare selecting between its own operands.
1856 %cmp = icmp ule <3 x i32> %a, %b
1857 %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
1858 store <3 x i32> %val, ptr addrspace(1) %out.gep
1862 ; FIXME: The packed op that computes the third element has an unused 16-bit component; it should be reduced to a scalar op.
; Vector <3 x i16> unsigned minimum expressed as icmp ule + select.
; Each workitem uses workitem.id.x to index %a.ptr, %b.ptr and %out.
; The expected lowering differs per run line:
;  - the kaveri checks shift/mask each 16-bit element out to 32 bits and use
;    v_min_u32, then re-pack the low two results with a shift and an OR;
;  - the tonga checks use v_min_u16, with an SDWA form for the high half;
;  - the gfx900 and newer checks use two v_pk_min_u16 instructions.
; In the amdgcn checks the result is written as one dword plus one short at
; byte offset 4.
1864 define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1865 ; EG-LABEL: v_test_umin_ule_v3i16:
1867 ; EG-NEXT: ALU 3, @20, KC0[CB0:0-32], KC1[]
1869 ; EG-NEXT: ALU 11, @24, KC0[CB0:0-32], KC1[]
1870 ; EG-NEXT: TEX 3 @12
1871 ; EG-NEXT: ALU 8, @36, KC0[], KC1[]
1872 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T8.X, 0
1873 ; EG-NEXT: MEM_RAT MSKOR T7.XW, T0.X
1875 ; EG-NEXT: Fetch clause starting at 8:
1876 ; EG-NEXT: VTX_READ_16 T7.X, T6.X, 4, #1
1877 ; EG-NEXT: VTX_READ_16 T8.X, T0.X, 4, #1
1878 ; EG-NEXT: Fetch clause starting at 12:
1879 ; EG-NEXT: VTX_READ_16 T8.X, T6.X, 0, #1
1880 ; EG-NEXT: VTX_READ_16 T9.X, T0.X, 0, #1
1881 ; EG-NEXT: VTX_READ_16 T6.X, T6.X, 2, #1
1882 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1
1883 ; EG-NEXT: ALU clause starting at 20:
1884 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1885 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1886 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
1887 ; EG-NEXT: ADD_INT * T6.X, KC0[2].W, PV.W,
1888 ; EG-NEXT: ALU clause starting at 24:
1889 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1890 ; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
1891 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1892 ; EG-NEXT: AND_INT * T2.W, PV.W, literal.x,
1893 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1894 ; EG-NEXT: LSHL T2.W, PV.W, literal.x,
1895 ; EG-NEXT: MIN_UINT * T3.W, T8.X, T7.X,
1896 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1897 ; EG-NEXT: LSHL T7.X, PS, PV.W,
1898 ; EG-NEXT: LSHL * T7.W, literal.x, PV.W,
1899 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1900 ; EG-NEXT: MOV * T7.Y, 0.0,
1901 ; EG-NEXT: ALU clause starting at 36:
1902 ; EG-NEXT: MOV T7.Z, 0.0,
1903 ; EG-NEXT: MIN_UINT * T2.W, T0.X, T6.X,
1904 ; EG-NEXT: LSHR T0.X, T1.W, literal.x,
1905 ; EG-NEXT: LSHL T1.W, PV.W, literal.y,
1906 ; EG-NEXT: MIN_UINT * T2.W, T9.X, T8.X,
1907 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
1908 ; EG-NEXT: OR_INT T6.X, PV.W, PS,
1909 ; EG-NEXT: LSHR * T8.X, T0.W, literal.x,
1910 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1912 ; CI-LABEL: v_test_umin_ule_v3i16:
1914 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1915 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
1916 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1917 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1918 ; CI-NEXT: v_mov_b32_e32 v1, s3
1919 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
1920 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1921 ; CI-NEXT: v_mov_b32_e32 v3, s5
1922 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
1923 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1924 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1925 ; CI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1926 ; CI-NEXT: v_mov_b32_e32 v5, s1
1927 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
1928 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
1929 ; CI-NEXT: v_add_i32_e32 v6, vcc, 4, v4
1930 ; CI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
1931 ; CI-NEXT: s_waitcnt vmcnt(1)
1932 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0
1933 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1934 ; CI-NEXT: s_waitcnt vmcnt(0)
1935 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
1936 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1937 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
1938 ; CI-NEXT: v_and_b32_e32 v3, 0xffff, v3
1939 ; CI-NEXT: v_min_u32_e32 v0, v0, v2
1940 ; CI-NEXT: v_min_u32_e32 v2, v8, v9
1941 ; CI-NEXT: v_min_u32_e32 v1, v1, v3
1942 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1943 ; CI-NEXT: v_or_b32_e32 v0, v0, v2
1944 ; CI-NEXT: flat_store_short v[6:7], v1
1945 ; CI-NEXT: flat_store_dword v[4:5], v0
1948 ; VI-LABEL: v_test_umin_ule_v3i16:
1950 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1951 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1952 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1953 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1954 ; VI-NEXT: v_mov_b32_e32 v1, s3
1955 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1956 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1957 ; VI-NEXT: v_mov_b32_e32 v3, s5
1958 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1959 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1960 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1961 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1962 ; VI-NEXT: v_mov_b32_e32 v5, s1
1963 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
1964 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
1965 ; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v4
1966 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
1967 ; VI-NEXT: s_waitcnt vmcnt(0)
1968 ; VI-NEXT: v_min_u16_e32 v8, v0, v2
1969 ; VI-NEXT: v_min_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1970 ; VI-NEXT: v_min_u16_e32 v1, v1, v3
1971 ; VI-NEXT: v_or_b32_e32 v0, v8, v0
1972 ; VI-NEXT: flat_store_short v[6:7], v1
1973 ; VI-NEXT: flat_store_dword v[4:5], v0
1976 ; GFX9-LABEL: v_test_umin_ule_v3i16:
1978 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1979 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1980 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1981 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1982 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
1983 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
1984 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1985 ; GFX9-NEXT: v_pk_min_u16 v1, v1, v3
1986 ; GFX9-NEXT: v_pk_min_u16 v0, v0, v2
1987 ; GFX9-NEXT: global_store_short v4, v1, s[0:1] offset:4
1988 ; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
1989 ; GFX9-NEXT: s_endpgm
1991 ; GFX10-LABEL: v_test_umin_ule_v3i16:
1993 ; GFX10-NEXT: s_clause 0x1
1994 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1995 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
1996 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1997 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1998 ; GFX10-NEXT: s_clause 0x1
1999 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
2000 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
2001 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2002 ; GFX10-NEXT: v_pk_min_u16 v1, v1, v3
2003 ; GFX10-NEXT: v_pk_min_u16 v0, v0, v2
2004 ; GFX10-NEXT: global_store_short v4, v1, s[0:1] offset:4
2005 ; GFX10-NEXT: global_store_dword v4, v0, s[0:1]
2006 ; GFX10-NEXT: s_endpgm
2008 ; GFX11-LABEL: v_test_umin_ule_v3i16:
2010 ; GFX11-NEXT: s_clause 0x1
2011 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
2012 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
2013 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2014 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2015 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
2016 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2017 ; GFX11-NEXT: s_clause 0x1
2018 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
2019 ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1]
2020 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2021 ; GFX11-NEXT: v_pk_min_u16 v1, v1, v3
2022 ; GFX11-NEXT: v_pk_min_u16 v0, v0, v2
2023 ; GFX11-NEXT: s_clause 0x1
2024 ; GFX11-NEXT: global_store_b16 v4, v1, s[4:5] offset:4
2025 ; GFX11-NEXT: global_store_b32 v4, v0, s[4:5]
2026 ; GFX11-NEXT: s_nop 0
2027 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2028 ; GFX11-NEXT: s_endpgm
2029 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2030 %a.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
2031 %b.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
2032 %out.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %out, i32 %tid
2034 %a = load <3 x i16>, ptr addrspace(1) %a.gep
2035 %b = load <3 x i16>, ptr addrspace(1) %b.gep
; Pattern under test: an unsigned <= compare selecting between its own operands.
2036 %cmp = icmp ule <3 x i16> %a, %b
2037 %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
2038 store <3 x i16> %val, ptr addrspace(1) %out.gep
; Unsigned minimum (icmp ule + select) of two i32 kernel arguments, stored
; to %out. The amdgcn checks expect a single s_min_u32 on the loaded
; arguments; the r600 checks expect one MIN_UINT whose operands come directly
; from KC0.
2042 define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
2043 ; EG-LABEL: s_test_umin_ule_i32:
2045 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
2046 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2049 ; EG-NEXT: ALU clause starting at 4:
2050 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
2051 ; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
2052 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2054 ; CI-LABEL: s_test_umin_ule_i32:
2056 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2057 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2058 ; CI-NEXT: s_min_u32 s2, s2, s3
2059 ; CI-NEXT: v_mov_b32_e32 v0, s0
2060 ; CI-NEXT: v_mov_b32_e32 v1, s1
2061 ; CI-NEXT: v_mov_b32_e32 v2, s2
2062 ; CI-NEXT: flat_store_dword v[0:1], v2
2065 ; VI-LABEL: s_test_umin_ule_i32:
2067 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2068 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2069 ; VI-NEXT: s_min_u32 s2, s2, s3
2070 ; VI-NEXT: v_mov_b32_e32 v0, s0
2071 ; VI-NEXT: v_mov_b32_e32 v1, s1
2072 ; VI-NEXT: v_mov_b32_e32 v2, s2
2073 ; VI-NEXT: flat_store_dword v[0:1], v2
2076 ; GFX9-LABEL: s_test_umin_ule_i32:
2078 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2079 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2080 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2081 ; GFX9-NEXT: s_min_u32 s2, s2, s3
2082 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2083 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2084 ; GFX9-NEXT: s_endpgm
2086 ; GFX10-LABEL: s_test_umin_ule_i32:
2088 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2089 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2090 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2091 ; GFX10-NEXT: s_min_u32 s2, s2, s3
2092 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
2093 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2094 ; GFX10-NEXT: s_endpgm
2096 ; GFX11-LABEL: s_test_umin_ule_i32:
2098 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2099 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2100 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2101 ; GFX11-NEXT: s_min_u32 s2, s2, s3
2102 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2103 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
2104 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2105 ; GFX11-NEXT: s_nop 0
2106 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2107 ; GFX11-NEXT: s_endpgm
; Pattern under test: an unsigned <= compare selecting between its own operands.
2108 %cmp = icmp ule i32 %a, %b
2109 %val = select i1 %cmp, i32 %a, i32 %b
2110 store i32 %val, ptr addrspace(1) %out, align 4
; Per-workitem i32 unsigned minimum expressed as icmp ult + select.
; %a and %b are loaded from %a.ptr and %b.ptr at index workitem.id.x and the
; result is stored to %out at the same index. Every run line is expected to
; fold the compare and select into one unsigned-min instruction (MIN_UINT in
; the r600 checks, v_min_u32 in the amdgcn checks).
2114 define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
2115 ; EG-LABEL: v_test_umin_ult_i32:
2117 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
2119 ; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
2120 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2123 ; EG-NEXT: Fetch clause starting at 6:
2124 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
2125 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
2126 ; EG-NEXT: ALU clause starting at 10:
2127 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
2128 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2129 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
2130 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
2131 ; EG-NEXT: ALU clause starting at 14:
2132 ; EG-NEXT: MIN_UINT T0.X, T0.X, T1.X,
2133 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
2134 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
2135 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2137 ; CI-LABEL: v_test_umin_ult_i32:
2139 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2140 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
2141 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
2142 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2143 ; CI-NEXT: v_mov_b32_e32 v1, s3
2144 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
2145 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2146 ; CI-NEXT: v_mov_b32_e32 v3, s5
2147 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
2148 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2149 ; CI-NEXT: flat_load_dword v5, v[0:1]
2150 ; CI-NEXT: flat_load_dword v2, v[2:3]
2151 ; CI-NEXT: v_mov_b32_e32 v1, s1
2152 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
2153 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2154 ; CI-NEXT: s_waitcnt vmcnt(0)
2155 ; CI-NEXT: v_min_u32_e32 v2, v5, v2
2156 ; CI-NEXT: flat_store_dword v[0:1], v2
2159 ; VI-LABEL: v_test_umin_ult_i32:
2161 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2162 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
2163 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
2164 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2165 ; VI-NEXT: v_mov_b32_e32 v1, s3
2166 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
2167 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2168 ; VI-NEXT: v_mov_b32_e32 v3, s5
2169 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
2170 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2171 ; VI-NEXT: flat_load_dword v5, v[0:1]
2172 ; VI-NEXT: flat_load_dword v2, v[2:3]
2173 ; VI-NEXT: v_mov_b32_e32 v1, s1
2174 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
2175 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2176 ; VI-NEXT: s_waitcnt vmcnt(0)
2177 ; VI-NEXT: v_min_u32_e32 v2, v5, v2
2178 ; VI-NEXT: flat_store_dword v[0:1], v2
2181 ; GFX9-LABEL: v_test_umin_ult_i32:
2183 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2184 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
2185 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2186 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2187 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2188 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
2189 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2190 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2
2191 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2192 ; GFX9-NEXT: s_endpgm
2194 ; GFX10-LABEL: v_test_umin_ult_i32:
2196 ; GFX10-NEXT: s_clause 0x1
2197 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2198 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
2199 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2201 ; GFX10-NEXT: s_clause 0x1
2202 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
2203 ; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
2204 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2205 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2
2206 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2207 ; GFX10-NEXT: s_endpgm
2209 ; GFX11-LABEL: v_test_umin_ult_i32:
2211 ; GFX11-NEXT: s_clause 0x1
2212 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
2213 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
2214 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2215 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2216 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2217 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2218 ; GFX11-NEXT: s_clause 0x1
2219 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
2220 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1]
2221 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2222 ; GFX11-NEXT: v_min_u32_e32 v1, v1, v2
2223 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
2224 ; GFX11-NEXT: s_nop 0
2225 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2226 ; GFX11-NEXT: s_endpgm
2227 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2228 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
2229 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
2230 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
2231 %a = load i32, ptr addrspace(1) %a.gep, align 4
2232 %b = load i32, ptr addrspace(1) %b.gep, align 4
; Pattern under test: an unsigned < compare selecting between its own operands.
2233 %cmp = icmp ult i32 %a, %b
2234 %val = select i1 %cmp, i32 %a, i32 %b
2235 store i32 %val, ptr addrspace(1) %out.gep, align 4
; Per-workitem i8 unsigned minimum expressed as icmp ult + select.
; Both operands are loaded from %a.ptr and %b.ptr at byte index workitem.id.x.
; The amdgcn checks load them with unsigned byte loads (flat_load_ubyte,
; global_load_ubyte, global_load_u8); the kaveri checks then use v_min_u32
; while the tonga and newer checks use v_min_u16. The result is written with
; a byte store on the amdgcn runs and with MEM_RAT MSKOR in the r600 checks.
2239 define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
2240 ; EG-LABEL: v_test_umin_ult_i8:
2242 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
2244 ; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[]
2245 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X
2248 ; EG-NEXT: Fetch clause starting at 6:
2249 ; EG-NEXT: VTX_READ_8 T2.X, T2.X, 0, #1
2250 ; EG-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1
2251 ; EG-NEXT: ALU clause starting at 10:
2252 ; EG-NEXT: ADD_INT T1.X, KC0[2].Z, T0.X,
2253 ; EG-NEXT: ADD_INT * T2.X, KC0[2].W, T0.X,
2254 ; EG-NEXT: ALU clause starting at 12:
2255 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.X,
2256 ; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
2257 ; EG-NEXT: MIN_UINT * T2.W, T1.X, T2.X,
2258 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2259 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
2260 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2261 ; EG-NEXT: LSHL T1.X, T2.W, PV.W,
2262 ; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
2263 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2264 ; EG-NEXT: MOV T1.Y, 0.0,
2265 ; EG-NEXT: MOV * T1.Z, 0.0,
2266 ; EG-NEXT: LSHR * T0.X, T0.W, literal.x,
2267 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2269 ; CI-LABEL: v_test_umin_ult_i8:
2271 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2272 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
2273 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2274 ; CI-NEXT: v_mov_b32_e32 v2, s3
2275 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
2276 ; CI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
2277 ; CI-NEXT: v_mov_b32_e32 v4, s5
2278 ; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v0
2279 ; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
2280 ; CI-NEXT: flat_load_ubyte v2, v[1:2]
2281 ; CI-NEXT: flat_load_ubyte v3, v[3:4]
2282 ; CI-NEXT: v_mov_b32_e32 v1, s1
2283 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
2284 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2285 ; CI-NEXT: s_waitcnt vmcnt(0)
2286 ; CI-NEXT: v_min_u32_e32 v2, v2, v3
2287 ; CI-NEXT: flat_store_byte v[0:1], v2
2290 ; VI-LABEL: v_test_umin_ult_i8:
2292 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2293 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
2294 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2295 ; VI-NEXT: v_mov_b32_e32 v2, s3
2296 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0
2297 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
2298 ; VI-NEXT: v_mov_b32_e32 v4, s5
2299 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v0
2300 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
2301 ; VI-NEXT: flat_load_ubyte v2, v[1:2]
2302 ; VI-NEXT: flat_load_ubyte v3, v[3:4]
2303 ; VI-NEXT: v_mov_b32_e32 v1, s1
2304 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2305 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2306 ; VI-NEXT: s_waitcnt vmcnt(0)
2307 ; VI-NEXT: v_min_u16_e32 v2, v2, v3
2308 ; VI-NEXT: flat_store_byte v[0:1], v2
2311 ; GFX9-LABEL: v_test_umin_ult_i8:
2313 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2314 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
2315 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2316 ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
2317 ; GFX9-NEXT: global_load_ubyte v2, v0, s[4:5]
2318 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2319 ; GFX9-NEXT: v_min_u16_e32 v1, v1, v2
2320 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
2321 ; GFX9-NEXT: s_endpgm
2323 ; GFX10-LABEL: v_test_umin_ult_i8:
2325 ; GFX10-NEXT: s_clause 0x1
2326 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2327 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
2328 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2329 ; GFX10-NEXT: s_clause 0x1
2330 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
2331 ; GFX10-NEXT: global_load_ubyte v2, v0, s[4:5]
2332 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2333 ; GFX10-NEXT: v_min_u16 v1, v1, v2
2334 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
2335 ; GFX10-NEXT: s_endpgm
2337 ; GFX11-LABEL: v_test_umin_ult_i8:
2339 ; GFX11-NEXT: s_clause 0x1
2340 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
2341 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
2342 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2343 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2344 ; GFX11-NEXT: s_clause 0x1
2345 ; GFX11-NEXT: global_load_u8 v1, v0, s[6:7]
2346 ; GFX11-NEXT: global_load_u8 v2, v0, s[0:1]
2347 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2348 ; GFX11-NEXT: v_min_u16 v1, v1, v2
2349 ; GFX11-NEXT: global_store_b8 v0, v1, s[4:5]
2350 ; GFX11-NEXT: s_nop 0
2351 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2352 ; GFX11-NEXT: s_endpgm
2353 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2354 %a.gep = getelementptr inbounds i8, ptr addrspace(1) %a.ptr, i32 %tid
2355 %b.gep = getelementptr inbounds i8, ptr addrspace(1) %b.ptr, i32 %tid
2356 %out.gep = getelementptr inbounds i8, ptr addrspace(1) %out, i32 %tid
2358 %a = load i8, ptr addrspace(1) %a.gep, align 1
2359 %b = load i8, ptr addrspace(1) %b.gep, align 1
; Pattern under test: an unsigned < compare selecting between its own operands.
2360 %cmp = icmp ult i8 %a, %b
2361 %val = select i1 %cmp, i8 %a, i8 %b
2362 store i8 %val, ptr addrspace(1) %out.gep, align 1
; Unsigned minimum (icmp ult + select) of two i32 kernel arguments, stored
; to %out. The amdgcn checks expect a single s_min_u32 on the loaded
; arguments; the r600 checks expect one MIN_UINT whose operands come directly
; from KC0.
2366 define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
2367 ; EG-LABEL: s_test_umin_ult_i32:
2369 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
2370 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2373 ; EG-NEXT: ALU clause starting at 4:
2374 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
2375 ; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
2376 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2378 ; CI-LABEL: s_test_umin_ult_i32:
2380 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2381 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2382 ; CI-NEXT: s_min_u32 s2, s2, s3
2383 ; CI-NEXT: v_mov_b32_e32 v0, s0
2384 ; CI-NEXT: v_mov_b32_e32 v1, s1
2385 ; CI-NEXT: v_mov_b32_e32 v2, s2
2386 ; CI-NEXT: flat_store_dword v[0:1], v2
2389 ; VI-LABEL: s_test_umin_ult_i32:
2391 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2392 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2393 ; VI-NEXT: s_min_u32 s2, s2, s3
2394 ; VI-NEXT: v_mov_b32_e32 v0, s0
2395 ; VI-NEXT: v_mov_b32_e32 v1, s1
2396 ; VI-NEXT: v_mov_b32_e32 v2, s2
2397 ; VI-NEXT: flat_store_dword v[0:1], v2
2400 ; GFX9-LABEL: s_test_umin_ult_i32:
2402 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2403 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2404 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2405 ; GFX9-NEXT: s_min_u32 s2, s2, s3
2406 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2407 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2408 ; GFX9-NEXT: s_endpgm
2410 ; GFX10-LABEL: s_test_umin_ult_i32:
2412 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2413 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2414 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2415 ; GFX10-NEXT: s_min_u32 s2, s2, s3
2416 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
2417 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2418 ; GFX10-NEXT: s_endpgm
2420 ; GFX11-LABEL: s_test_umin_ult_i32:
2422 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2423 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2424 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2425 ; GFX11-NEXT: s_min_u32 s2, s2, s3
2426 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2427 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
2428 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2429 ; GFX11-NEXT: s_nop 0
2430 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2431 ; GFX11-NEXT: s_endpgm
; Pattern under test: an unsigned < compare selecting between its own operands.
2432 %cmp = icmp ult i32 %a, %b
2433 %val = select i1 %cmp, i32 %a, i32 %b
2434 store i32 %val, ptr addrspace(1) %out, align 4
; icmp ult + select where the compare result has a second use: besides
; feeding the select (stored to %out0), %cmp is also stored as an i1 to %out1.
; The checks expect no min instruction here. The amdgcn checks keep
; s_cmp_lt_u32 and use its result for both v_cndmask_b32 (the stored i1) and
; s_cselect_b32 (the selected value); the r600 checks use SETGT_UINT and
; SETGE_UINT with CNDE_INT.
; Both inputs are loaded directly from %aptr and %bptr with no workitem index,
; and the amdgcn checks expect scalar s_load instructions for them.
2438 define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
2439 ; EG-LABEL: v_test_umin_ult_i32_multi_use:
2441 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
2443 ; EG-NEXT: ALU 16, @12, KC0[CB0:0-32], KC1[]
2444 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 0
2445 ; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
2447 ; EG-NEXT: Fetch clause starting at 6:
2448 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
2449 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
2450 ; EG-NEXT: ALU clause starting at 10:
2451 ; EG-NEXT: MOV T0.X, KC0[2].W,
2452 ; EG-NEXT: MOV * T1.X, KC0[3].X,
2453 ; EG-NEXT: ALU clause starting at 12:
2454 ; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x,
2455 ; EG-NEXT: SETGT_UINT * T1.W, T1.X, T0.X,
2456 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2457 ; EG-NEXT: AND_INT T1.W, PS, 1,
2458 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2459 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2460 ; EG-NEXT: LSHL T2.X, PV.W, PS,
2461 ; EG-NEXT: LSHL * T2.W, literal.x, PS,
2462 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2463 ; EG-NEXT: MOV T2.Y, 0.0,
2464 ; EG-NEXT: MOV * T2.Z, 0.0,
2465 ; EG-NEXT: LSHR T3.X, KC0[2].Z, literal.x,
2466 ; EG-NEXT: SETGE_UINT * T0.W, T0.X, T1.X,
2467 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2468 ; EG-NEXT: CNDE_INT T0.X, PV.W, T0.X, T1.X,
2469 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2470 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2472 ; CI-LABEL: v_test_umin_ult_i32_multi_use:
2474 ; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0
2475 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2476 ; CI-NEXT: s_load_dword s4, s[4:5], 0x0
2477 ; CI-NEXT: s_load_dword s5, s[6:7], 0x0
2478 ; CI-NEXT: v_mov_b32_e32 v0, s0
2479 ; CI-NEXT: v_mov_b32_e32 v1, s1
2480 ; CI-NEXT: v_mov_b32_e32 v2, s2
2481 ; CI-NEXT: v_mov_b32_e32 v3, s3
2482 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2483 ; CI-NEXT: s_cmp_lt_u32 s4, s5
2484 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
2485 ; CI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
2486 ; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
2487 ; CI-NEXT: s_cselect_b32 s0, s4, s5
2488 ; CI-NEXT: v_mov_b32_e32 v5, s0
2489 ; CI-NEXT: flat_store_dword v[0:1], v5
2490 ; CI-NEXT: flat_store_byte v[2:3], v4
2493 ; VI-LABEL: v_test_umin_ult_i32_multi_use:
2495 ; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0
2496 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2497 ; VI-NEXT: s_load_dword s4, s[4:5], 0x0
2498 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0
2499 ; VI-NEXT: v_mov_b32_e32 v0, s0
2500 ; VI-NEXT: v_mov_b32_e32 v1, s1
2501 ; VI-NEXT: v_mov_b32_e32 v2, s2
2502 ; VI-NEXT: v_mov_b32_e32 v3, s3
2503 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2504 ; VI-NEXT: s_cmp_lt_u32 s4, s5
2505 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
2506 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
2507 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
2508 ; VI-NEXT: s_cselect_b32 s0, s4, s5
2509 ; VI-NEXT: v_mov_b32_e32 v5, s0
2510 ; VI-NEXT: flat_store_dword v[0:1], v5
2511 ; VI-NEXT: flat_store_byte v[2:3], v4
2514 ; GFX9-LABEL: v_test_umin_ult_i32_multi_use:
2516 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
2517 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2518 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2519 ; GFX9-NEXT: s_load_dword s2, s[12:13], 0x0
2520 ; GFX9-NEXT: s_load_dword s3, s[14:15], 0x0
2521 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2522 ; GFX9-NEXT: s_cmp_lt_u32 s2, s3
2523 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
2524 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
2525 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
2526 ; GFX9-NEXT: s_cselect_b32 s0, s2, s3
2527 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
2528 ; GFX9-NEXT: global_store_dword v0, v2, s[8:9]
2529 ; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
2530 ; GFX9-NEXT: s_endpgm
2532 ; GFX10-LABEL: v_test_umin_ult_i32_multi_use:
2534 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
2535 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
2536 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2537 ; GFX10-NEXT: s_load_dword s0, s[12:13], 0x0
2538 ; GFX10-NEXT: s_load_dword s1, s[14:15], 0x0
2539 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2540 ; GFX10-NEXT: s_cmp_lt_u32 s0, s1
2541 ; GFX10-NEXT: s_cselect_b32 s2, -1, 0
2542 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
2543 ; GFX10-NEXT: s_and_b32 s2, s2, exec_lo
2544 ; GFX10-NEXT: s_cselect_b32 s0, s0, s1
2545 ; GFX10-NEXT: v_mov_b32_e32 v2, s0
2546 ; GFX10-NEXT: global_store_dword v1, v2, s[8:9]
2547 ; GFX10-NEXT: global_store_byte v1, v0, s[10:11]
2548 ; GFX10-NEXT: s_endpgm
2550 ; GFX11-LABEL: v_test_umin_ult_i32_multi_use:
2552 ; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0
2553 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
2554 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2555 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
2556 ; GFX11-NEXT: s_load_b32 s5, s[6:7], 0x0
2557 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2558 ; GFX11-NEXT: s_cmp_lt_u32 s4, s5
2559 ; GFX11-NEXT: s_cselect_b32 s6, -1, 0
2560 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
2561 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
2562 ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
2563 ; GFX11-NEXT: s_cselect_b32 s4, s4, s5
2564 ; GFX11-NEXT: v_mov_b32_e32 v2, s4
2565 ; GFX11-NEXT: s_clause 0x1
2566 ; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
2567 ; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
2568 ; GFX11-NEXT: s_nop 0
2569 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2570 ; GFX11-NEXT: s_endpgm
2571 %a = load i32, ptr addrspace(1) %aptr, align 4
2572 %b = load i32, ptr addrspace(1) %bptr, align 4
2573 %cmp = icmp ult i32 %a, %b
2574 %val = select i1 %cmp, i32 %a, i32 %b
2575 store i32 %val, ptr addrspace(1) %out0, align 4
; Second use of %cmp: the raw i1 compare result is stored as well.
2576 store i1 %cmp, ptr addrspace(1) %out1
; Unsigned min of two loaded i16 values where the compare has a second use.
; %cmp feeds the select and is also stored as an i1 to %out1.
; The checks below expect the compare to stay separate from the select
; (SETGE_UINT / CNDE_INT on EG, v_cmp_lt_u32 + v_cndmask on the GCN targets)
; instead of a single min instruction.
2580 define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
2581 ; EG-LABEL: v_test_umin_ult_i16_multi_use:
2583 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
2585 ; EG-NEXT: ALU 24, @12, KC0[CB0:0-32], KC1[]
2586 ; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
2587 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
2589 ; EG-NEXT: Fetch clause starting at 6:
2590 ; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
2591 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
2592 ; EG-NEXT: ALU clause starting at 10:
2593 ; EG-NEXT: MOV T0.X, KC0[2].W,
2594 ; EG-NEXT: MOV * T1.X, KC0[3].X,
2595 ; EG-NEXT: ALU clause starting at 12:
2596 ; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
2597 ; EG-NEXT: SETGE_UINT * T1.W, T0.X, T1.X,
2598 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2599 ; EG-NEXT: CNDE_INT T1.W, PS, T0.X, T1.X,
2600 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2601 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2602 ; EG-NEXT: LSHL T2.X, PV.W, PS,
2603 ; EG-NEXT: LSHL * T2.W, literal.x, PS,
2604 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2605 ; EG-NEXT: MOV T2.Y, 0.0,
2606 ; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x,
2607 ; EG-NEXT: SETGT_UINT * T1.W, T1.X, T0.X,
2608 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2609 ; EG-NEXT: AND_INT T1.W, PS, 1,
2610 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2611 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2612 ; EG-NEXT: LSHL T0.X, PV.W, PS,
2613 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
2614 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2615 ; EG-NEXT: MOV T0.Y, 0.0,
2616 ; EG-NEXT: MOV T2.Z, 0.0,
2617 ; EG-NEXT: MOV * T0.Z, 0.0,
2618 ; EG-NEXT: LSHR T1.X, KC0[2].Z, literal.x,
2619 ; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
2620 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2622 ; CI-LABEL: v_test_umin_ult_i16_multi_use:
2624 ; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0
2625 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2626 ; CI-NEXT: v_mov_b32_e32 v0, s4
2627 ; CI-NEXT: v_mov_b32_e32 v1, s5
2628 ; CI-NEXT: v_mov_b32_e32 v2, s6
2629 ; CI-NEXT: v_mov_b32_e32 v3, s7
2630 ; CI-NEXT: flat_load_ushort v4, v[0:1]
2631 ; CI-NEXT: flat_load_ushort v5, v[2:3]
2632 ; CI-NEXT: v_mov_b32_e32 v0, s0
2633 ; CI-NEXT: v_mov_b32_e32 v1, s1
2634 ; CI-NEXT: v_mov_b32_e32 v2, s2
2635 ; CI-NEXT: v_mov_b32_e32 v3, s3
2636 ; CI-NEXT: s_waitcnt vmcnt(0)
2637 ; CI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5
2638 ; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
2639 ; CI-NEXT: flat_store_short v[0:1], v4
2640 ; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
2641 ; CI-NEXT: flat_store_byte v[2:3], v0
2644 ; VI-LABEL: v_test_umin_ult_i16_multi_use:
2646 ; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0
2647 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2648 ; VI-NEXT: v_mov_b32_e32 v0, s4
2649 ; VI-NEXT: v_mov_b32_e32 v1, s5
2650 ; VI-NEXT: v_mov_b32_e32 v2, s6
2651 ; VI-NEXT: v_mov_b32_e32 v3, s7
2652 ; VI-NEXT: flat_load_ushort v4, v[0:1]
2653 ; VI-NEXT: flat_load_ushort v5, v[2:3]
2654 ; VI-NEXT: v_mov_b32_e32 v0, s0
2655 ; VI-NEXT: v_mov_b32_e32 v1, s1
2656 ; VI-NEXT: v_mov_b32_e32 v2, s2
2657 ; VI-NEXT: v_mov_b32_e32 v3, s3
2658 ; VI-NEXT: s_waitcnt vmcnt(0)
2659 ; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5
2660 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
2661 ; VI-NEXT: flat_store_short v[0:1], v4
2662 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
2663 ; VI-NEXT: flat_store_byte v[2:3], v0
2666 ; GFX9-LABEL: v_test_umin_ult_i16_multi_use:
2668 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
2669 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2670 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2671 ; GFX9-NEXT: global_load_ushort v1, v0, s[12:13]
2672 ; GFX9-NEXT: global_load_ushort v2, v0, s[14:15]
2673 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2674 ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
2675 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
2676 ; GFX9-NEXT: global_store_short v0, v1, s[8:9]
2677 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
2678 ; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
2679 ; GFX9-NEXT: s_endpgm
2681 ; GFX10-LABEL: v_test_umin_ult_i16_multi_use:
2683 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
2684 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2685 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2686 ; GFX10-NEXT: s_clause 0x1
2687 ; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
2688 ; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
2689 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2690 ; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
2691 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
2692 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
2693 ; GFX10-NEXT: global_store_short v0, v1, s[8:9]
2694 ; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
2695 ; GFX10-NEXT: s_endpgm
2697 ; GFX11-LABEL: v_test_umin_ult_i16_multi_use:
2699 ; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0
2700 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2701 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2702 ; GFX11-NEXT: s_clause 0x1
2703 ; GFX11-NEXT: global_load_u16 v1, v0, s[4:5]
2704 ; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
2705 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2706 ; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
2707 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
2708 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
2709 ; GFX11-NEXT: s_clause 0x1
2710 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
2711 ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
2712 ; GFX11-NEXT: s_nop 0
2713 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2714 ; GFX11-NEXT: s_endpgm
2715 %a = load i16, ptr addrspace(1) %aptr, align 2
2716 %b = load i16, ptr addrspace(1) %bptr, align 2
2717 %cmp = icmp ult i16 %a, %b
2718 %val = select i1 %cmp, i16 %a, i16 %b
2719 store i16 %val, ptr addrspace(1) %out0, align 2
2720 store i1 %cmp, ptr addrspace(1) %out1
; Unsigned min on a single-element vector passed directly as kernel arguments.
; Every target is expected to lower the icmp ult + select pair to one
; unsigned-min instruction (MIN_UINT on EG, s_min_u32 on the GCN targets).
2724 define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
2725 ; EG-LABEL: s_test_umin_ult_v1i32:
2727 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
2728 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2731 ; EG-NEXT: ALU clause starting at 4:
2732 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
2733 ; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
2734 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2736 ; CI-LABEL: s_test_umin_ult_v1i32:
2738 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2739 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2740 ; CI-NEXT: s_min_u32 s2, s2, s3
2741 ; CI-NEXT: v_mov_b32_e32 v0, s0
2742 ; CI-NEXT: v_mov_b32_e32 v1, s1
2743 ; CI-NEXT: v_mov_b32_e32 v2, s2
2744 ; CI-NEXT: flat_store_dword v[0:1], v2
2747 ; VI-LABEL: s_test_umin_ult_v1i32:
2749 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2750 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2751 ; VI-NEXT: s_min_u32 s2, s2, s3
2752 ; VI-NEXT: v_mov_b32_e32 v0, s0
2753 ; VI-NEXT: v_mov_b32_e32 v1, s1
2754 ; VI-NEXT: v_mov_b32_e32 v2, s2
2755 ; VI-NEXT: flat_store_dword v[0:1], v2
2758 ; GFX9-LABEL: s_test_umin_ult_v1i32:
2760 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2761 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2762 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2763 ; GFX9-NEXT: s_min_u32 s2, s2, s3
2764 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2765 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2766 ; GFX9-NEXT: s_endpgm
2768 ; GFX10-LABEL: s_test_umin_ult_v1i32:
2770 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2771 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2772 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2773 ; GFX10-NEXT: s_min_u32 s2, s2, s3
2774 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
2775 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2776 ; GFX10-NEXT: s_endpgm
2778 ; GFX11-LABEL: s_test_umin_ult_v1i32:
2780 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2781 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2782 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2783 ; GFX11-NEXT: s_min_u32 s2, s2, s3
2784 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2785 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
2786 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2787 ; GFX11-NEXT: s_nop 0
2788 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2789 ; GFX11-NEXT: s_endpgm
2790 %cmp = icmp ult <1 x i32> %a, %b
2791 %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
2792 store <1 x i32> %val, ptr addrspace(1) %out
; Unsigned min on <8 x i32> kernel arguments.
; The checks expect eight scalar unsigned-min instructions (one per element)
; and the 32-byte result written with two 128-bit stores, the upper half at
; byte offset 16 from %out.
2796 define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) #0 {
2797 ; EG-LABEL: s_test_umin_ult_v8i32:
2799 ; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
2800 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
2801 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2803 ; EG-NEXT: ALU clause starting at 4:
2804 ; EG-NEXT: MIN_UINT * T0.W, KC0[5].X, KC0[7].X,
2805 ; EG-NEXT: MIN_UINT * T0.Z, KC0[4].W, KC0[6].W,
2806 ; EG-NEXT: MIN_UINT * T0.Y, KC0[4].Z, KC0[6].Z,
2807 ; EG-NEXT: MIN_UINT * T0.X, KC0[4].Y, KC0[6].Y,
2808 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2809 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2810 ; EG-NEXT: MIN_UINT * T2.W, KC0[6].X, KC0[8].X,
2811 ; EG-NEXT: MIN_UINT * T2.Z, KC0[5].W, KC0[7].W,
2812 ; EG-NEXT: MIN_UINT * T2.Y, KC0[5].Z, KC0[7].Z,
2813 ; EG-NEXT: MIN_UINT * T2.X, KC0[5].Y, KC0[7].Y,
2814 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
2815 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2816 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
2817 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2819 ; CI-LABEL: s_test_umin_ult_v8i32:
2821 ; CI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8
2822 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
2823 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2824 ; CI-NEXT: s_min_u32 s4, s11, s19
2825 ; CI-NEXT: s_min_u32 s5, s10, s18
2826 ; CI-NEXT: s_min_u32 s6, s9, s17
2827 ; CI-NEXT: s_min_u32 s7, s8, s16
2828 ; CI-NEXT: s_min_u32 s2, s15, s23
2829 ; CI-NEXT: s_min_u32 s3, s14, s22
2830 ; CI-NEXT: s_min_u32 s8, s13, s21
2831 ; CI-NEXT: s_min_u32 s9, s12, s20
2832 ; CI-NEXT: v_mov_b32_e32 v3, s2
2833 ; CI-NEXT: s_add_u32 s2, s0, 16
2834 ; CI-NEXT: v_mov_b32_e32 v2, s3
2835 ; CI-NEXT: s_addc_u32 s3, s1, 0
2836 ; CI-NEXT: v_mov_b32_e32 v5, s3
2837 ; CI-NEXT: v_mov_b32_e32 v0, s9
2838 ; CI-NEXT: v_mov_b32_e32 v1, s8
2839 ; CI-NEXT: v_mov_b32_e32 v4, s2
2840 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2841 ; CI-NEXT: v_mov_b32_e32 v5, s1
2842 ; CI-NEXT: v_mov_b32_e32 v0, s7
2843 ; CI-NEXT: v_mov_b32_e32 v1, s6
2844 ; CI-NEXT: v_mov_b32_e32 v2, s5
2845 ; CI-NEXT: v_mov_b32_e32 v3, s4
2846 ; CI-NEXT: v_mov_b32_e32 v4, s0
2847 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2850 ; VI-LABEL: s_test_umin_ult_v8i32:
2852 ; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20
2853 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
2854 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2855 ; VI-NEXT: s_min_u32 s4, s11, s19
2856 ; VI-NEXT: s_min_u32 s5, s10, s18
2857 ; VI-NEXT: s_min_u32 s6, s9, s17
2858 ; VI-NEXT: s_min_u32 s7, s8, s16
2859 ; VI-NEXT: s_min_u32 s2, s15, s23
2860 ; VI-NEXT: s_min_u32 s3, s14, s22
2861 ; VI-NEXT: s_min_u32 s8, s13, s21
2862 ; VI-NEXT: s_min_u32 s9, s12, s20
2863 ; VI-NEXT: v_mov_b32_e32 v3, s2
2864 ; VI-NEXT: s_add_u32 s2, s0, 16
2865 ; VI-NEXT: v_mov_b32_e32 v2, s3
2866 ; VI-NEXT: s_addc_u32 s3, s1, 0
2867 ; VI-NEXT: v_mov_b32_e32 v5, s3
2868 ; VI-NEXT: v_mov_b32_e32 v0, s9
2869 ; VI-NEXT: v_mov_b32_e32 v1, s8
2870 ; VI-NEXT: v_mov_b32_e32 v4, s2
2871 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2872 ; VI-NEXT: v_mov_b32_e32 v5, s1
2873 ; VI-NEXT: v_mov_b32_e32 v0, s7
2874 ; VI-NEXT: v_mov_b32_e32 v1, s6
2875 ; VI-NEXT: v_mov_b32_e32 v2, s5
2876 ; VI-NEXT: v_mov_b32_e32 v3, s4
2877 ; VI-NEXT: v_mov_b32_e32 v4, s0
2878 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2881 ; GFX9-LABEL: s_test_umin_ult_v8i32:
2883 ; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20
2884 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
2885 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2886 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2887 ; GFX9-NEXT: s_min_u32 s4, s9, s17
2888 ; GFX9-NEXT: s_min_u32 s5, s8, s16
2889 ; GFX9-NEXT: s_min_u32 s6, s15, s23
2890 ; GFX9-NEXT: s_min_u32 s7, s14, s22
2891 ; GFX9-NEXT: s_min_u32 s8, s13, s21
2892 ; GFX9-NEXT: s_min_u32 s9, s12, s20
2893 ; GFX9-NEXT: s_min_u32 s2, s11, s19
2894 ; GFX9-NEXT: s_min_u32 s3, s10, s18
2895 ; GFX9-NEXT: v_mov_b32_e32 v0, s9
2896 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
2897 ; GFX9-NEXT: v_mov_b32_e32 v2, s7
2898 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
2899 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
2900 ; GFX9-NEXT: s_nop 0
2901 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
2902 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
2903 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
2904 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
2905 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
2906 ; GFX9-NEXT: s_endpgm
2908 ; GFX10-LABEL: s_test_umin_ult_v8i32:
2910 ; GFX10-NEXT: s_clause 0x1
2911 ; GFX10-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20
2912 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
2913 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
2914 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2915 ; GFX10-NEXT: s_min_u32 s4, s9, s17
2916 ; GFX10-NEXT: s_min_u32 s5, s8, s16
2917 ; GFX10-NEXT: s_min_u32 s6, s15, s23
2918 ; GFX10-NEXT: s_min_u32 s7, s14, s22
2919 ; GFX10-NEXT: s_min_u32 s8, s12, s20
2920 ; GFX10-NEXT: s_min_u32 s9, s13, s21
2921 ; GFX10-NEXT: s_min_u32 s2, s11, s19
2922 ; GFX10-NEXT: s_min_u32 s3, s10, s18
2923 ; GFX10-NEXT: v_mov_b32_e32 v0, s8
2924 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
2925 ; GFX10-NEXT: v_mov_b32_e32 v2, s7
2926 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
2927 ; GFX10-NEXT: v_mov_b32_e32 v4, s5
2928 ; GFX10-NEXT: v_mov_b32_e32 v5, s4
2929 ; GFX10-NEXT: v_mov_b32_e32 v6, s3
2930 ; GFX10-NEXT: v_mov_b32_e32 v7, s2
2931 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
2932 ; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
2933 ; GFX10-NEXT: s_endpgm
2935 ; GFX11-LABEL: s_test_umin_ult_v8i32:
2937 ; GFX11-NEXT: s_clause 0x1
2938 ; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x20
2939 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
2940 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
2941 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2942 ; GFX11-NEXT: s_min_u32 s2, s7, s15
2943 ; GFX11-NEXT: s_min_u32 s3, s6, s14
2944 ; GFX11-NEXT: s_min_u32 s6, s11, s19
2945 ; GFX11-NEXT: s_min_u32 s7, s10, s18
2946 ; GFX11-NEXT: s_min_u32 s8, s8, s16
2947 ; GFX11-NEXT: s_min_u32 s9, s9, s17
2948 ; GFX11-NEXT: s_min_u32 s5, s5, s13
2949 ; GFX11-NEXT: s_min_u32 s4, s4, s12
2950 ; GFX11-NEXT: v_mov_b32_e32 v0, s8
2951 ; GFX11-NEXT: v_mov_b32_e32 v1, s9
2952 ; GFX11-NEXT: v_mov_b32_e32 v2, s7
2953 ; GFX11-NEXT: v_mov_b32_e32 v3, s6
2954 ; GFX11-NEXT: v_mov_b32_e32 v4, s4
2955 ; GFX11-NEXT: v_mov_b32_e32 v5, s5
2956 ; GFX11-NEXT: v_mov_b32_e32 v6, s3
2957 ; GFX11-NEXT: v_mov_b32_e32 v7, s2
2958 ; GFX11-NEXT: s_clause 0x1
2959 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
2960 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
2961 ; GFX11-NEXT: s_nop 0
2962 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2963 ; GFX11-NEXT: s_endpgm
2964 %cmp = icmp ult <8 x i32> %a, %b
2965 %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
2966 store <8 x i32> %val, ptr addrspace(1) %out
; Unsigned min on <8 x i16> kernel arguments.
; The expected lowering in the checks below differs per target.
;  - EG reads each 16-bit element separately and merges eight MIN_UINT results
;    into packed dwords with shifts, masks and ORs.
;  - CI and VI split every dword into halves (s_lshr_b32 / s_and_b32 0xffff),
;    run eight s_min_u32 and repack with s_lshl_b32 / s_or_b32.
;  - GFX9, GFX10 and GFX11 use four packed v_pk_min_u16 instructions.
2970 define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16> %a, <8 x i16> %b) #0 {
2971 ; EG-LABEL: s_test_umin_ult_v8i16:
2973 ; EG-NEXT: ALU 1, @52, KC0[], KC1[]
2974 ; EG-NEXT: TEX 1 @20
2975 ; EG-NEXT: ALU 9, @54, KC0[], KC1[]
2976 ; EG-NEXT: TEX 1 @24
2977 ; EG-NEXT: ALU 8, @64, KC0[], KC1[]
2978 ; EG-NEXT: TEX 1 @28
2979 ; EG-NEXT: ALU 10, @73, KC0[], KC1[]
2980 ; EG-NEXT: TEX 1 @32
2981 ; EG-NEXT: ALU 8, @84, KC0[], KC1[]
2982 ; EG-NEXT: TEX 1 @36
2983 ; EG-NEXT: ALU 10, @93, KC0[], KC1[]
2984 ; EG-NEXT: TEX 1 @40
2985 ; EG-NEXT: ALU 8, @104, KC0[], KC1[]
2986 ; EG-NEXT: TEX 1 @44
2987 ; EG-NEXT: ALU 10, @113, KC0[], KC1[]
2988 ; EG-NEXT: TEX 1 @48
2989 ; EG-NEXT: ALU 10, @124, KC0[CB0:0-32], KC1[]
2990 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
2993 ; EG-NEXT: Fetch clause starting at 20:
2994 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
2995 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 82, #3
2996 ; EG-NEXT: Fetch clause starting at 24:
2997 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
2998 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 80, #3
2999 ; EG-NEXT: Fetch clause starting at 28:
3000 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
3001 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 78, #3
3002 ; EG-NEXT: Fetch clause starting at 32:
3003 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
3004 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 76, #3
3005 ; EG-NEXT: Fetch clause starting at 36:
3006 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
3007 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 74, #3
3008 ; EG-NEXT: Fetch clause starting at 40:
3009 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
3010 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 72, #3
3011 ; EG-NEXT: Fetch clause starting at 44:
3012 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
3013 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 70, #3
3014 ; EG-NEXT: Fetch clause starting at 48:
3015 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 52, #3
3016 ; EG-NEXT: VTX_READ_16 T7.X, T7.X, 68, #3
3017 ; EG-NEXT: ALU clause starting at 52:
3018 ; EG-NEXT: MOV * T0.Y, T3.X,
3019 ; EG-NEXT: MOV * T7.X, 0.0,
3020 ; EG-NEXT: ALU clause starting at 54:
3021 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3022 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3023 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3024 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3025 ; EG-NEXT: LSHL T0.W, PV.W, literal.x,
3026 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3027 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
3028 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
3029 ; EG-NEXT: MOV * T3.X, PV.W,
3030 ; EG-NEXT: MOV * T0.Y, PV.X,
3031 ; EG-NEXT: ALU clause starting at 64:
3032 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3033 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3034 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3035 ; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
3036 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3037 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
3038 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3039 ; EG-NEXT: MOV T3.X, PV.W,
3040 ; EG-NEXT: MOV * T0.Y, T2.X,
3041 ; EG-NEXT: ALU clause starting at 73:
3042 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3043 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3044 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3045 ; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
3046 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
3047 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3048 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3049 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3050 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3051 ; EG-NEXT: MOV * T2.X, PV.W,
3052 ; EG-NEXT: MOV * T0.Y, PV.X,
3053 ; EG-NEXT: ALU clause starting at 84:
3054 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3055 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3056 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3057 ; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
3058 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3059 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
3060 ; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
3061 ; EG-NEXT: MOV T2.X, PV.Z,
3062 ; EG-NEXT: MOV * T0.Y, T5.X,
3063 ; EG-NEXT: ALU clause starting at 93:
3064 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3065 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3066 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3067 ; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
3068 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
3069 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3070 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3071 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3072 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3073 ; EG-NEXT: MOV * T5.X, PV.W,
3074 ; EG-NEXT: MOV * T0.Y, PV.X,
3075 ; EG-NEXT: ALU clause starting at 104:
3076 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3077 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3078 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3079 ; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
3080 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3081 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
3082 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3083 ; EG-NEXT: MOV T5.X, PV.W,
3084 ; EG-NEXT: MOV * T0.Y, T4.X,
3085 ; EG-NEXT: ALU clause starting at 113:
3086 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3087 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3088 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3089 ; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
3090 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
3091 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3092 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3093 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3094 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3095 ; EG-NEXT: MOV * T4.X, PV.W,
3096 ; EG-NEXT: MOV * T0.Y, PV.X,
3097 ; EG-NEXT: ALU clause starting at 124:
3098 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3099 ; EG-NEXT: AND_INT * T1.W, T7.X, literal.x,
3100 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3101 ; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
3102 ; EG-NEXT: AND_INT T2.W, T0.Y, literal.y,
3103 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3104 ; EG-NEXT: 2(2.802597e-45), -65536(nan)
3105 ; EG-NEXT: OR_INT * T7.X, PV.W, PS,
3106 ; EG-NEXT: MOV T4.X, PV.X,
3107 ; EG-NEXT: MOV * T7.W, T3.X,
3108 ; EG-NEXT: MOV * T7.Y, T5.X,
3110 ; CI-LABEL: s_test_umin_ult_v8i16:
3112 ; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4
3113 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3114 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3115 ; CI-NEXT: s_lshr_b32 s2, s8, 16
3116 ; CI-NEXT: s_and_b32 s3, s8, 0xffff
3117 ; CI-NEXT: s_lshr_b32 s4, s9, 16
3118 ; CI-NEXT: s_and_b32 s5, s9, 0xffff
3119 ; CI-NEXT: s_lshr_b32 s6, s10, 16
3120 ; CI-NEXT: s_and_b32 s7, s10, 0xffff
3121 ; CI-NEXT: s_lshr_b32 s8, s11, 16
3122 ; CI-NEXT: s_and_b32 s9, s11, 0xffff
3123 ; CI-NEXT: s_lshr_b32 s10, s12, 16
3124 ; CI-NEXT: s_and_b32 s11, s12, 0xffff
3125 ; CI-NEXT: s_lshr_b32 s12, s13, 16
3126 ; CI-NEXT: s_lshr_b32 s16, s14, 16
3127 ; CI-NEXT: s_lshr_b32 s17, s15, 16
3128 ; CI-NEXT: s_and_b32 s13, s13, 0xffff
3129 ; CI-NEXT: s_and_b32 s14, s14, 0xffff
3130 ; CI-NEXT: s_and_b32 s15, s15, 0xffff
3131 ; CI-NEXT: s_min_u32 s8, s8, s17
3132 ; CI-NEXT: s_min_u32 s6, s6, s16
3133 ; CI-NEXT: s_min_u32 s4, s4, s12
3134 ; CI-NEXT: s_min_u32 s2, s2, s10
3135 ; CI-NEXT: s_min_u32 s9, s9, s15
3136 ; CI-NEXT: s_lshl_b32 s8, s8, 16
3137 ; CI-NEXT: s_min_u32 s7, s7, s14
3138 ; CI-NEXT: s_lshl_b32 s6, s6, 16
3139 ; CI-NEXT: s_min_u32 s5, s5, s13
3140 ; CI-NEXT: s_lshl_b32 s4, s4, 16
3141 ; CI-NEXT: s_min_u32 s3, s3, s11
3142 ; CI-NEXT: s_lshl_b32 s2, s2, 16
3143 ; CI-NEXT: s_or_b32 s8, s9, s8
3144 ; CI-NEXT: s_or_b32 s6, s7, s6
3145 ; CI-NEXT: s_or_b32 s4, s5, s4
3146 ; CI-NEXT: s_or_b32 s2, s3, s2
3147 ; CI-NEXT: v_mov_b32_e32 v5, s1
3148 ; CI-NEXT: v_mov_b32_e32 v0, s2
3149 ; CI-NEXT: v_mov_b32_e32 v1, s4
3150 ; CI-NEXT: v_mov_b32_e32 v2, s6
3151 ; CI-NEXT: v_mov_b32_e32 v3, s8
3152 ; CI-NEXT: v_mov_b32_e32 v4, s0
3153 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3156 ; VI-LABEL: s_test_umin_ult_v8i16:
3158 ; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10
3159 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3160 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3161 ; VI-NEXT: s_lshr_b32 s2, s11, 16
3162 ; VI-NEXT: s_lshr_b32 s4, s10, 16
3163 ; VI-NEXT: s_and_b32 s5, s10, 0xffff
3164 ; VI-NEXT: s_lshr_b32 s10, s15, 16
3165 ; VI-NEXT: s_and_b32 s3, s11, 0xffff
3166 ; VI-NEXT: s_and_b32 s11, s15, 0xffff
3167 ; VI-NEXT: s_lshr_b32 s15, s14, 16
3168 ; VI-NEXT: s_min_u32 s2, s2, s10
3169 ; VI-NEXT: s_lshr_b32 s6, s9, 16
3170 ; VI-NEXT: s_and_b32 s7, s9, 0xffff
3171 ; VI-NEXT: s_lshr_b32 s9, s8, 16
3172 ; VI-NEXT: s_and_b32 s14, s14, 0xffff
3173 ; VI-NEXT: s_lshr_b32 s16, s13, 16
3174 ; VI-NEXT: s_lshr_b32 s17, s12, 16
3175 ; VI-NEXT: s_min_u32 s4, s4, s15
3176 ; VI-NEXT: s_min_u32 s3, s3, s11
3177 ; VI-NEXT: s_lshl_b32 s2, s2, 16
3178 ; VI-NEXT: s_and_b32 s8, s8, 0xffff
3179 ; VI-NEXT: s_and_b32 s13, s13, 0xffff
3180 ; VI-NEXT: s_and_b32 s12, s12, 0xffff
3181 ; VI-NEXT: s_min_u32 s9, s9, s17
3182 ; VI-NEXT: s_min_u32 s6, s6, s16
3183 ; VI-NEXT: s_min_u32 s5, s5, s14
3184 ; VI-NEXT: s_or_b32 s2, s3, s2
3185 ; VI-NEXT: s_lshl_b32 s3, s4, 16
3186 ; VI-NEXT: s_min_u32 s8, s8, s12
3187 ; VI-NEXT: s_min_u32 s7, s7, s13
3188 ; VI-NEXT: s_or_b32 s3, s5, s3
3189 ; VI-NEXT: s_lshl_b32 s4, s6, 16
3190 ; VI-NEXT: s_lshl_b32 s5, s9, 16
3191 ; VI-NEXT: s_or_b32 s4, s7, s4
3192 ; VI-NEXT: s_or_b32 s5, s8, s5
3193 ; VI-NEXT: v_mov_b32_e32 v5, s1
3194 ; VI-NEXT: v_mov_b32_e32 v0, s5
3195 ; VI-NEXT: v_mov_b32_e32 v1, s4
3196 ; VI-NEXT: v_mov_b32_e32 v2, s3
3197 ; VI-NEXT: v_mov_b32_e32 v3, s2
3198 ; VI-NEXT: v_mov_b32_e32 v4, s0
3199 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3202 ; GFX9-LABEL: s_test_umin_ult_v8i16:
3204 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10
3205 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3206 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
3207 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3208 ; GFX9-NEXT: v_mov_b32_e32 v0, s15
3209 ; GFX9-NEXT: v_mov_b32_e32 v1, s14
3210 ; GFX9-NEXT: v_pk_min_u16 v3, s11, v0
3211 ; GFX9-NEXT: v_mov_b32_e32 v0, s13
3212 ; GFX9-NEXT: v_pk_min_u16 v2, s10, v1
3213 ; GFX9-NEXT: v_pk_min_u16 v1, s9, v0
3214 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
3215 ; GFX9-NEXT: v_pk_min_u16 v0, s8, v0
3216 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3217 ; GFX9-NEXT: s_endpgm
3219 ; GFX10-LABEL: s_test_umin_ult_v8i16:
3221 ; GFX10-NEXT: s_clause 0x1
3222 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10
3223 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3224 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
3225 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3226 ; GFX10-NEXT: v_pk_min_u16 v3, s11, s15
3227 ; GFX10-NEXT: v_pk_min_u16 v2, s10, s14
3228 ; GFX10-NEXT: v_pk_min_u16 v1, s9, s13
3229 ; GFX10-NEXT: v_pk_min_u16 v0, s8, s12
3230 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3231 ; GFX10-NEXT: s_endpgm
3233 ; GFX11-LABEL: s_test_umin_ult_v8i16:
3235 ; GFX11-NEXT: s_clause 0x1
3236 ; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10
3237 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
3238 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
3239 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3240 ; GFX11-NEXT: v_pk_min_u16 v3, s7, s11
3241 ; GFX11-NEXT: v_pk_min_u16 v2, s6, s10
3242 ; GFX11-NEXT: v_pk_min_u16 v1, s5, s9
3243 ; GFX11-NEXT: v_pk_min_u16 v0, s4, s8
3244 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
3245 ; GFX11-NEXT: s_nop 0
3246 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3247 ; GFX11-NEXT: s_endpgm
3248 %cmp = icmp ult <8 x i16> %a, %b
3249 %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
3250 store <8 x i16> %val, ptr addrspace(1) %out
3254 ; Make sure the redundant 'and' is removed.
; Both inputs are zero-extended from i16, so masking the unsigned-min result
; with 65535 cannot change it.
; The GCN checks mask each input with 0xffff before s_min_u32 and expect no
; further masking between the min and the store.
3256 define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
3257 ; EG-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3259 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
3261 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
3262 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
3265 ; EG-NEXT: Fetch clause starting at 6:
3266 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 72, #3
3267 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 108, #3
3268 ; EG-NEXT: ALU clause starting at 10:
3269 ; EG-NEXT: MOV * T0.X, 0.0,
3270 ; EG-NEXT: ALU clause starting at 11:
3271 ; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
3272 ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
3273 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3274 ; EG-NEXT: MIN_UINT T0.X, PV.Z, PV.W,
3275 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3276 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3278 ; CI-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3280 ; CI-NEXT: s_load_dword s2, s[6:7], 0xa
3281 ; CI-NEXT: s_load_dword s3, s[6:7], 0x13
3282 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3283 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3284 ; CI-NEXT: s_and_b32 s2, s2, 0xffff
3285 ; CI-NEXT: s_and_b32 s3, s3, 0xffff
3286 ; CI-NEXT: s_min_u32 s2, s2, s3
3287 ; CI-NEXT: v_mov_b32_e32 v0, s0
3288 ; CI-NEXT: v_mov_b32_e32 v1, s1
3289 ; CI-NEXT: v_mov_b32_e32 v2, s2
3290 ; CI-NEXT: flat_store_dword v[0:1], v2
3293 ; VI-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3295 ; VI-NEXT: s_load_dword s2, s[6:7], 0x28
3296 ; VI-NEXT: s_load_dword s3, s[6:7], 0x4c
3297 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3298 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3299 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
3300 ; VI-NEXT: s_and_b32 s3, s3, 0xffff
3301 ; VI-NEXT: s_min_u32 s2, s2, s3
3302 ; VI-NEXT: v_mov_b32_e32 v0, s0
3303 ; VI-NEXT: v_mov_b32_e32 v1, s1
3304 ; VI-NEXT: v_mov_b32_e32 v2, s2
3305 ; VI-NEXT: flat_store_dword v[0:1], v2
3308 ; GFX9-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3310 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28
3311 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c
3312 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3313 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3314 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3315 ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
3316 ; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
3317 ; GFX9-NEXT: s_min_u32 s2, s2, s3
3318 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3319 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
3320 ; GFX9-NEXT: s_endpgm
3322 ; GFX10-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3324 ; GFX10-NEXT: s_clause 0x2
3325 ; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28
3326 ; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c
3327 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3328 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
3329 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3330 ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
3331 ; GFX10-NEXT: s_and_b32 s3, s3, 0xffff
3332 ; GFX10-NEXT: s_min_u32 s2, s2, s3
3333 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
3334 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
3335 ; GFX10-NEXT: s_endpgm
3337 ; GFX11-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3339 ; GFX11-NEXT: s_clause 0x2
3340 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28
3341 ; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c
3342 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
3343 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3344 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3345 ; GFX11-NEXT: s_and_b32 s2, s4, 0xffff
3346 ; GFX11-NEXT: s_and_b32 s3, s5, 0xffff
3347 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3348 ; GFX11-NEXT: s_min_u32 s2, s2, s3
3349 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
3350 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
3351 ; GFX11-NEXT: s_nop 0
3352 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3353 ; GFX11-NEXT: s_endpgm
3354 %a.ext = zext i16 %a to i32
3355 %b.ext = zext i16 %b to i32
3356 %cmp = icmp ult i32 %a.ext, %b.ext
3357 %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
3358 %mask = and i32 %val, 65535
3359 store i32 %mask, ptr addrspace(1) %out
3363 ; Make sure the redundant sign_extend_inreg is removed.
; Signed-min demanded-bits test: both i16 arguments are sign-extended to i32,
; reduced with an slt compare + select, and the result is sign-extended in
; register once more (shl 16 / ashr 16) before the i32 store. The checks below
; expect one sign extension per operand followed by a single signed min, with
; no further extension of the min result before it is stored.
3365 define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace(1) %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 {
3366 ; EG-LABEL: simplify_demanded_bits_test_min_slt_i16:
3368 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
3370 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
3371 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
3374 ; EG-NEXT: Fetch clause starting at 6:
3375 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 72, #3
3376 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 108, #3
3377 ; EG-NEXT: ALU clause starting at 10:
3378 ; EG-NEXT: MOV * T0.X, 0.0,
3379 ; EG-NEXT: ALU clause starting at 11:
3380 ; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
3381 ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
3382 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3383 ; EG-NEXT: MIN_INT T0.X, PV.Z, PV.W,
3384 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3385 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3387 ; CI-LABEL: simplify_demanded_bits_test_min_slt_i16:
3389 ; CI-NEXT: s_load_dword s2, s[6:7], 0xa
3390 ; CI-NEXT: s_load_dword s3, s[6:7], 0x13
3391 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3392 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3393 ; CI-NEXT: s_sext_i32_i16 s2, s2
3394 ; CI-NEXT: s_sext_i32_i16 s3, s3
3395 ; CI-NEXT: s_min_i32 s2, s2, s3
3396 ; CI-NEXT: v_mov_b32_e32 v0, s0
3397 ; CI-NEXT: v_mov_b32_e32 v1, s1
3398 ; CI-NEXT: v_mov_b32_e32 v2, s2
3399 ; CI-NEXT: flat_store_dword v[0:1], v2
3402 ; VI-LABEL: simplify_demanded_bits_test_min_slt_i16:
3404 ; VI-NEXT: s_load_dword s2, s[6:7], 0x28
3405 ; VI-NEXT: s_load_dword s3, s[6:7], 0x4c
3406 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3407 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3408 ; VI-NEXT: s_sext_i32_i16 s2, s2
3409 ; VI-NEXT: s_sext_i32_i16 s3, s3
3410 ; VI-NEXT: s_min_i32 s2, s2, s3
3411 ; VI-NEXT: v_mov_b32_e32 v0, s0
3412 ; VI-NEXT: v_mov_b32_e32 v1, s1
3413 ; VI-NEXT: v_mov_b32_e32 v2, s2
3414 ; VI-NEXT: flat_store_dword v[0:1], v2
3417 ; GFX9-LABEL: simplify_demanded_bits_test_min_slt_i16:
3419 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28
3420 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c
3421 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3422 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3423 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3424 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
3425 ; GFX9-NEXT: s_sext_i32_i16 s3, s3
3426 ; GFX9-NEXT: s_min_i32 s2, s2, s3
3427 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3428 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
3429 ; GFX9-NEXT: s_endpgm
3431 ; GFX10-LABEL: simplify_demanded_bits_test_min_slt_i16:
3433 ; GFX10-NEXT: s_clause 0x2
3434 ; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28
3435 ; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c
3436 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3437 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
3438 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3439 ; GFX10-NEXT: s_sext_i32_i16 s2, s2
3440 ; GFX10-NEXT: s_sext_i32_i16 s3, s3
3441 ; GFX10-NEXT: s_min_i32 s2, s2, s3
3442 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
3443 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
3444 ; GFX10-NEXT: s_endpgm
3446 ; GFX11-LABEL: simplify_demanded_bits_test_min_slt_i16:
3448 ; GFX11-NEXT: s_clause 0x2
3449 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28
3450 ; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c
3451 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
3452 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3453 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3454 ; GFX11-NEXT: s_sext_i32_i16 s2, s4
3455 ; GFX11-NEXT: s_sext_i32_i16 s3, s5
3456 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3457 ; GFX11-NEXT: s_min_i32 s2, s2, s3
3458 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
3459 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
3460 ; GFX11-NEXT: s_nop 0
3461 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3462 ; GFX11-NEXT: s_endpgm
; Signed min of the two sign-extended arguments, then shl/ashr by 16, which
; sign-extends the low 16 bits of the min in place.
3463 %a.ext = sext i16 %a to i32
3464 %b.ext = sext i16 %b to i32
3465 %cmp = icmp slt i32 %a.ext, %b.ext
3466 %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
3467 %shl = shl i32 %val, 16
3468 %sextinreg = ashr i32 %shl, 16
3469 store i32 %sextinreg, ptr addrspace(1) %out
; Scalar i16 signed min written as icmp sle + select on two i16 kernel
; arguments, with the i16 result stored to %out. On the amdgcn targets the
; checks expect one 32-bit scalar load feeding both operands (s_sext_i32_i16
; for one, s_ashr_i32 by 16 for the other), a 32-bit s_min_i32, and a 16-bit
; store. The EG checks do the min with MIN_INT on two BFE_INT results and
; write the value with a MEM_RAT MSKOR store.
3473 define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i16 %b) #0 {
3474 ; EG-LABEL: s_test_imin_sle_i16:
3476 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
3478 ; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
3479 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
3482 ; EG-NEXT: Fetch clause starting at 6:
3483 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
3484 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 42, #3
3485 ; EG-NEXT: ALU clause starting at 10:
3486 ; EG-NEXT: MOV * T0.X, 0.0,
3487 ; EG-NEXT: ALU clause starting at 11:
3488 ; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
3489 ; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
3490 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
3491 ; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45)
3492 ; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
3493 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
3494 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
3495 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
3496 ; EG-NEXT: LSHL T0.X, PV.W, PS,
3497 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
3498 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3499 ; EG-NEXT: MOV T0.Y, 0.0,
3500 ; EG-NEXT: MOV * T0.Z, 0.0,
3501 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3502 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3504 ; CI-LABEL: s_test_imin_sle_i16:
3506 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2
3507 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3508 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3509 ; CI-NEXT: s_sext_i32_i16 s3, s2
3510 ; CI-NEXT: s_ashr_i32 s2, s2, 16
3511 ; CI-NEXT: s_min_i32 s2, s3, s2
3512 ; CI-NEXT: v_mov_b32_e32 v0, s0
3513 ; CI-NEXT: v_mov_b32_e32 v1, s1
3514 ; CI-NEXT: v_mov_b32_e32 v2, s2
3515 ; CI-NEXT: flat_store_short v[0:1], v2
3518 ; VI-LABEL: s_test_imin_sle_i16:
3520 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
3521 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3522 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3523 ; VI-NEXT: s_sext_i32_i16 s3, s2
3524 ; VI-NEXT: s_ashr_i32 s2, s2, 16
3525 ; VI-NEXT: s_min_i32 s2, s3, s2
3526 ; VI-NEXT: v_mov_b32_e32 v0, s0
3527 ; VI-NEXT: v_mov_b32_e32 v1, s1
3528 ; VI-NEXT: v_mov_b32_e32 v2, s2
3529 ; VI-NEXT: flat_store_short v[0:1], v2
3532 ; GFX9-LABEL: s_test_imin_sle_i16:
3534 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8
3535 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3536 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3537 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3538 ; GFX9-NEXT: s_sext_i32_i16 s3, s2
3539 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16
3540 ; GFX9-NEXT: s_min_i32 s2, s3, s2
3541 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3542 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
3543 ; GFX9-NEXT: s_endpgm
3545 ; GFX10-LABEL: s_test_imin_sle_i16:
3547 ; GFX10-NEXT: s_clause 0x1
3548 ; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8
3549 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
3550 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
3551 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3552 ; GFX10-NEXT: s_sext_i32_i16 s3, s2
3553 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16
3554 ; GFX10-NEXT: s_min_i32 s2, s3, s2
3555 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
3556 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
3557 ; GFX10-NEXT: s_endpgm
3559 ; GFX11-LABEL: s_test_imin_sle_i16:
3561 ; GFX11-NEXT: s_clause 0x1
3562 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
3563 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
3564 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3565 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3566 ; GFX11-NEXT: s_sext_i32_i16 s2, s4
3567 ; GFX11-NEXT: s_ashr_i32 s3, s4, 16
3568 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3569 ; GFX11-NEXT: s_min_i32 s2, s2, s3
3570 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
3571 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
3572 ; GFX11-NEXT: s_nop 0
3573 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3574 ; GFX11-NEXT: s_endpgm
; Signed min expressed as compare (sle) + select directly on the i16 values.
3575 %cmp = icmp sle i16 %a, %b
3576 %val = select i1 %cmp, i16 %a, i16 %b
3577 store i16 %val, ptr addrspace(1) %out
; 64-bit unsigned min written as icmp ult + select on two i64 kernel
; arguments, stored with align 8. The checks contain no 64-bit min
; instruction: the amdgcn targets compare with v_cmp_lt_u64 and then pick each
; 32-bit half with s_cselect_b32, while the EG checks combine SETE_INT,
; SETGT_UINT and CNDE_INT operations.
3583 define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3584 ; EG-LABEL: test_umin_ult_i64:
3586 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
3587 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3590 ; EG-NEXT: ALU clause starting at 4:
3591 ; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3592 ; EG-NEXT: SETGT_UINT * T0.W, KC0[3].Z, KC0[3].X,
3593 ; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3594 ; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3595 ; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3596 ; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3597 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3598 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3600 ; CI-LABEL: test_umin_ult_i64:
3602 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3603 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
3604 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3605 ; CI-NEXT: v_mov_b32_e32 v0, s0
3606 ; CI-NEXT: v_mov_b32_e32 v1, s4
3607 ; CI-NEXT: v_mov_b32_e32 v2, s5
3608 ; CI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
3609 ; CI-NEXT: v_mov_b32_e32 v1, s1
3610 ; CI-NEXT: s_and_b64 s[0:1], vcc, exec
3611 ; CI-NEXT: s_cselect_b32 s0, s3, s5
3612 ; CI-NEXT: s_cselect_b32 s1, s2, s4
3613 ; CI-NEXT: v_mov_b32_e32 v2, s1
3614 ; CI-NEXT: v_mov_b32_e32 v3, s0
3615 ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3618 ; VI-LABEL: test_umin_ult_i64:
3620 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3621 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
3622 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3623 ; VI-NEXT: v_mov_b32_e32 v0, s0
3624 ; VI-NEXT: v_mov_b32_e32 v1, s4
3625 ; VI-NEXT: v_mov_b32_e32 v2, s5
3626 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
3627 ; VI-NEXT: v_mov_b32_e32 v1, s1
3628 ; VI-NEXT: s_and_b64 s[0:1], vcc, exec
3629 ; VI-NEXT: s_cselect_b32 s0, s3, s5
3630 ; VI-NEXT: s_cselect_b32 s1, s2, s4
3631 ; VI-NEXT: v_mov_b32_e32 v2, s1
3632 ; VI-NEXT: v_mov_b32_e32 v3, s0
3633 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3636 ; GFX9-LABEL: test_umin_ult_i64:
3638 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
3639 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3641 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3642 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3643 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3644 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3645 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
3646 ; GFX9-NEXT: s_cselect_b32 s3, s3, s5
3647 ; GFX9-NEXT: s_cselect_b32 s2, s2, s4
3648 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3649 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3650 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3651 ; GFX9-NEXT: s_endpgm
3653 ; GFX10-LABEL: test_umin_ult_i64:
3655 ; GFX10-NEXT: s_clause 0x1
3656 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3657 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
3658 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3659 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3660 ; GFX10-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5]
3661 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
3662 ; GFX10-NEXT: s_cselect_b32 s2, s2, s4
3663 ; GFX10-NEXT: s_cselect_b32 s3, s3, s5
3664 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
3665 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
3666 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3667 ; GFX10-NEXT: s_endpgm
3669 ; GFX11-LABEL: test_umin_ult_i64:
3671 ; GFX11-NEXT: s_clause 0x1
3672 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
3673 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
3674 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
3675 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3676 ; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[6:7], s[0:1]
3677 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3678 ; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
3679 ; GFX11-NEXT: s_cselect_b32 s0, s6, s0
3680 ; GFX11-NEXT: s_cselect_b32 s1, s7, s1
3681 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
3682 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
3683 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
3684 ; GFX11-NEXT: s_nop 0
3685 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3686 ; GFX11-NEXT: s_endpgm
; Unsigned min expressed as compare (ult) + select on the full i64 values.
3687 %tmp = icmp ult i64 %a, %b
3688 %val = select i1 %tmp, i64 %a, i64 %b
3689 store i64 %val, ptr addrspace(1) %out, align 8
; 64-bit unsigned min written as icmp ule + select on two i64 kernel
; arguments, stored with align 8. The checks contain no 64-bit min
; instruction: the amdgcn targets compare with v_cmp_le_u64 and then pick each
; 32-bit half with s_cselect_b32, while the EG checks combine SETE_INT,
; SETGT_UINT and CNDE_INT operations.
3693 define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3694 ; EG-LABEL: test_umin_ule_i64:
3696 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
3697 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3700 ; EG-NEXT: ALU clause starting at 4:
3701 ; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3702 ; EG-NEXT: SETGT_UINT * T0.W, KC0[3].Z, KC0[3].X,
3703 ; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3704 ; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3705 ; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3706 ; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3707 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3708 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3710 ; CI-LABEL: test_umin_ule_i64:
3712 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3713 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
3714 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3715 ; CI-NEXT: v_mov_b32_e32 v0, s0
3716 ; CI-NEXT: v_mov_b32_e32 v1, s4
3717 ; CI-NEXT: v_mov_b32_e32 v2, s5
3718 ; CI-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
3719 ; CI-NEXT: v_mov_b32_e32 v1, s1
3720 ; CI-NEXT: s_and_b64 s[0:1], vcc, exec
3721 ; CI-NEXT: s_cselect_b32 s0, s3, s5
3722 ; CI-NEXT: s_cselect_b32 s1, s2, s4
3723 ; CI-NEXT: v_mov_b32_e32 v2, s1
3724 ; CI-NEXT: v_mov_b32_e32 v3, s0
3725 ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3728 ; VI-LABEL: test_umin_ule_i64:
3730 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3731 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
3732 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3733 ; VI-NEXT: v_mov_b32_e32 v0, s0
3734 ; VI-NEXT: v_mov_b32_e32 v1, s4
3735 ; VI-NEXT: v_mov_b32_e32 v2, s5
3736 ; VI-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
3737 ; VI-NEXT: v_mov_b32_e32 v1, s1
3738 ; VI-NEXT: s_and_b64 s[0:1], vcc, exec
3739 ; VI-NEXT: s_cselect_b32 s0, s3, s5
3740 ; VI-NEXT: s_cselect_b32 s1, s2, s4
3741 ; VI-NEXT: v_mov_b32_e32 v2, s1
3742 ; VI-NEXT: v_mov_b32_e32 v3, s0
3743 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3746 ; GFX9-LABEL: test_umin_ule_i64:
3748 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
3749 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3750 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3751 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3752 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3753 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3754 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[0:1]
3755 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
3756 ; GFX9-NEXT: s_cselect_b32 s3, s3, s5
3757 ; GFX9-NEXT: s_cselect_b32 s2, s2, s4
3758 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3759 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3760 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3761 ; GFX9-NEXT: s_endpgm
3763 ; GFX10-LABEL: test_umin_ule_i64:
3765 ; GFX10-NEXT: s_clause 0x1
3766 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3767 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
3768 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3769 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3770 ; GFX10-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5]
3771 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
3772 ; GFX10-NEXT: s_cselect_b32 s2, s2, s4
3773 ; GFX10-NEXT: s_cselect_b32 s3, s3, s5
3774 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
3775 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
3776 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3777 ; GFX10-NEXT: s_endpgm
3779 ; GFX11-LABEL: test_umin_ule_i64:
3781 ; GFX11-NEXT: s_clause 0x1
3782 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
3783 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
3784 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
3785 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3786 ; GFX11-NEXT: v_cmp_le_u64_e64 s2, s[6:7], s[0:1]
3787 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3788 ; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
3789 ; GFX11-NEXT: s_cselect_b32 s0, s6, s0
3790 ; GFX11-NEXT: s_cselect_b32 s1, s7, s1
3791 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
3792 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
3793 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
3794 ; GFX11-NEXT: s_nop 0
3795 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3796 ; GFX11-NEXT: s_endpgm
; Unsigned min expressed as compare (ule) + select on the full i64 values.
3797 %tmp = icmp ule i64 %a, %b
3798 %val = select i1 %tmp, i64 %a, i64 %b
3799 store i64 %val, ptr addrspace(1) %out, align 8
; 64-bit signed min written as icmp slt + select on two i64 kernel arguments,
; stored with align 8. The checks contain no 64-bit min instruction: the
; amdgcn targets compare with v_cmp_lt_i64 and then pick each 32-bit half with
; s_cselect_b32, while the EG checks use one signed (SETGT_INT) and one
; unsigned (SETGT_UINT) greater-than compare together with SETE_INT and
; CNDE_INT.
3803 define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3804 ; EG-LABEL: test_imin_slt_i64:
3806 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
3807 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3810 ; EG-NEXT: ALU clause starting at 4:
3811 ; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3812 ; EG-NEXT: SETGT_INT * T0.W, KC0[3].Z, KC0[3].X,
3813 ; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3814 ; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3815 ; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3816 ; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3817 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3818 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3820 ; CI-LABEL: test_imin_slt_i64:
3822 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3823 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
3824 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3825 ; CI-NEXT: v_mov_b32_e32 v0, s0
3826 ; CI-NEXT: v_mov_b32_e32 v1, s4
3827 ; CI-NEXT: v_mov_b32_e32 v2, s5
3828 ; CI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
3829 ; CI-NEXT: v_mov_b32_e32 v1, s1
3830 ; CI-NEXT: s_and_b64 s[0:1], vcc, exec
3831 ; CI-NEXT: s_cselect_b32 s0, s3, s5
3832 ; CI-NEXT: s_cselect_b32 s1, s2, s4
3833 ; CI-NEXT: v_mov_b32_e32 v2, s1
3834 ; CI-NEXT: v_mov_b32_e32 v3, s0
3835 ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3838 ; VI-LABEL: test_imin_slt_i64:
3840 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3841 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
3842 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3843 ; VI-NEXT: v_mov_b32_e32 v0, s0
3844 ; VI-NEXT: v_mov_b32_e32 v1, s4
3845 ; VI-NEXT: v_mov_b32_e32 v2, s5
3846 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
3847 ; VI-NEXT: v_mov_b32_e32 v1, s1
3848 ; VI-NEXT: s_and_b64 s[0:1], vcc, exec
3849 ; VI-NEXT: s_cselect_b32 s0, s3, s5
3850 ; VI-NEXT: s_cselect_b32 s1, s2, s4
3851 ; VI-NEXT: v_mov_b32_e32 v2, s1
3852 ; VI-NEXT: v_mov_b32_e32 v3, s0
3853 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3856 ; GFX9-LABEL: test_imin_slt_i64:
3858 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
3859 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3860 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3861 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3862 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3863 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3864 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
3865 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
3866 ; GFX9-NEXT: s_cselect_b32 s3, s3, s5
3867 ; GFX9-NEXT: s_cselect_b32 s2, s2, s4
3868 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3869 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3870 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3871 ; GFX9-NEXT: s_endpgm
3873 ; GFX10-LABEL: test_imin_slt_i64:
3875 ; GFX10-NEXT: s_clause 0x1
3876 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3877 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
3878 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3879 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3880 ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5]
3881 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
3882 ; GFX10-NEXT: s_cselect_b32 s2, s2, s4
3883 ; GFX10-NEXT: s_cselect_b32 s3, s3, s5
3884 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
3885 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
3886 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3887 ; GFX10-NEXT: s_endpgm
3889 ; GFX11-LABEL: test_imin_slt_i64:
3891 ; GFX11-NEXT: s_clause 0x1
3892 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
3893 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
3894 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
3895 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3896 ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[0:1]
3897 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3898 ; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
3899 ; GFX11-NEXT: s_cselect_b32 s0, s6, s0
3900 ; GFX11-NEXT: s_cselect_b32 s1, s7, s1
3901 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
3902 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
3903 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
3904 ; GFX11-NEXT: s_nop 0
3905 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3906 ; GFX11-NEXT: s_endpgm
; Signed min expressed as compare (slt) + select on the full i64 values.
3907 %tmp = icmp slt i64 %a, %b
3908 %val = select i1 %tmp, i64 %a, i64 %b
3909 store i64 %val, ptr addrspace(1) %out, align 8
; 64-bit signed min written as icmp sle + select on two i64 kernel arguments,
; stored with align 8. The checks contain no 64-bit min instruction: the
; amdgcn targets compare with v_cmp_le_i64 and then pick each 32-bit half with
; s_cselect_b32, while the EG checks use one signed (SETGT_INT) and one
; unsigned (SETGT_UINT) greater-than compare together with SETE_INT and
; CNDE_INT.
3913 define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3914 ; EG-LABEL: test_imin_sle_i64:
3916 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
3917 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3920 ; EG-NEXT: ALU clause starting at 4:
3921 ; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3922 ; EG-NEXT: SETGT_INT * T0.W, KC0[3].Z, KC0[3].X,
3923 ; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3924 ; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3925 ; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3926 ; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3927 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3928 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3930 ; CI-LABEL: test_imin_sle_i64:
3932 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3933 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
3934 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3935 ; CI-NEXT: v_mov_b32_e32 v0, s0
3936 ; CI-NEXT: v_mov_b32_e32 v1, s4
3937 ; CI-NEXT: v_mov_b32_e32 v2, s5
3938 ; CI-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
3939 ; CI-NEXT: v_mov_b32_e32 v1, s1
3940 ; CI-NEXT: s_and_b64 s[0:1], vcc, exec
3941 ; CI-NEXT: s_cselect_b32 s0, s3, s5
3942 ; CI-NEXT: s_cselect_b32 s1, s2, s4
3943 ; CI-NEXT: v_mov_b32_e32 v2, s1
3944 ; CI-NEXT: v_mov_b32_e32 v3, s0
3945 ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3948 ; VI-LABEL: test_imin_sle_i64:
3950 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3951 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
3952 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3953 ; VI-NEXT: v_mov_b32_e32 v0, s0
3954 ; VI-NEXT: v_mov_b32_e32 v1, s4
3955 ; VI-NEXT: v_mov_b32_e32 v2, s5
3956 ; VI-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
3957 ; VI-NEXT: v_mov_b32_e32 v1, s1
3958 ; VI-NEXT: s_and_b64 s[0:1], vcc, exec
3959 ; VI-NEXT: s_cselect_b32 s0, s3, s5
3960 ; VI-NEXT: s_cselect_b32 s1, s2, s4
3961 ; VI-NEXT: v_mov_b32_e32 v2, s1
3962 ; VI-NEXT: v_mov_b32_e32 v3, s0
3963 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3966 ; GFX9-LABEL: test_imin_sle_i64:
3968 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
3969 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3970 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3971 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3972 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3973 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3974 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[0:1]
3975 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
3976 ; GFX9-NEXT: s_cselect_b32 s3, s3, s5
3977 ; GFX9-NEXT: s_cselect_b32 s2, s2, s4
3978 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3979 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3980 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3981 ; GFX9-NEXT: s_endpgm
3983 ; GFX10-LABEL: test_imin_sle_i64:
3985 ; GFX10-NEXT: s_clause 0x1
3986 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
3987 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
3988 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3989 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3990 ; GFX10-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5]
3991 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
3992 ; GFX10-NEXT: s_cselect_b32 s2, s2, s4
3993 ; GFX10-NEXT: s_cselect_b32 s3, s3, s5
3994 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
3995 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
3996 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3997 ; GFX10-NEXT: s_endpgm
3999 ; GFX11-LABEL: test_imin_sle_i64:
4001 ; GFX11-NEXT: s_clause 0x1
4002 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
4003 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
4004 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
4005 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4006 ; GFX11-NEXT: v_cmp_le_i64_e64 s2, s[6:7], s[0:1]
4007 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4008 ; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
4009 ; GFX11-NEXT: s_cselect_b32 s0, s6, s0
4010 ; GFX11-NEXT: s_cselect_b32 s1, s7, s1
4011 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
4012 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
4013 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
4014 ; GFX11-NEXT: s_nop 0
4015 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4016 ; GFX11-NEXT: s_endpgm
; Signed min expressed as compare (sle) + select on the full i64 values.
4017 %tmp = icmp sle i64 %a, %b
4018 %val = select i1 %tmp, i64 %a, i64 %b
4019 store i64 %val, ptr addrspace(1) %out, align 8
; Per-workitem <2 x i16> signed min: each workitem indexes %a.ptr, %b.ptr and
; %out by its workitem id, loads one <2 x i16> from each input, and stores the
; element-wise icmp sle + select result. The checks show three lowerings on
; the amdgcn targets: the CI checks unpack both halves to 32 bits
; (v_bfe_i32 / v_ashrrev_i32), use two v_min_i32 and repack; the VI checks use
; v_min_i16 plus an SDWA v_min_i16 on the upper words; the GFX9, GFX10 and
; GFX11 checks use a single v_pk_min_i16. The EG checks extend each half with
; BFE_INT, apply MIN_INT twice and repack with LSHL/AND_INT/OR_INT.
4023 define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
4024 ; EG-LABEL: v_test_imin_sle_v2i16:
4026 ; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
4028 ; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
4029 ; EG-NEXT: TEX 0 @10
4030 ; EG-NEXT: ALU 16, @16, KC0[CB0:0-32], KC1[]
4031 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
4034 ; EG-NEXT: Fetch clause starting at 8:
4035 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
4036 ; EG-NEXT: Fetch clause starting at 10:
4037 ; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
4038 ; EG-NEXT: ALU clause starting at 12:
4039 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
4040 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4041 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
4042 ; EG-NEXT: ALU clause starting at 15:
4043 ; EG-NEXT: ADD_INT * T7.X, KC0[2].W, T0.W,
4044 ; EG-NEXT: ALU clause starting at 16:
4045 ; EG-NEXT: LSHR T1.W, T0.X, literal.x,
4046 ; EG-NEXT: LSHR * T2.W, T7.X, literal.x,
4047 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4048 ; EG-NEXT: BFE_INT T8.X, PS, 0.0, literal.x,
4049 ; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
4050 ; EG-NEXT: BFE_INT T0.Z, T7.X, 0.0, literal.x,
4051 ; EG-NEXT: BFE_INT * T1.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
4052 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4053 ; EG-NEXT: MIN_INT T1.W, PV.W, PV.Z,
4054 ; EG-NEXT: MIN_INT * T2.W, PV.Y, PV.X,
4055 ; EG-NEXT: LSHL T2.W, PS, literal.x,
4056 ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
4057 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
4058 ; EG-NEXT: OR_INT T0.X, PS, PV.W,
4059 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
4060 ; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
4061 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4063 ; CI-LABEL: v_test_imin_sle_v2i16:
4065 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
4066 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
4067 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
4068 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4069 ; CI-NEXT: v_mov_b32_e32 v1, s3
4070 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
4071 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4072 ; CI-NEXT: v_mov_b32_e32 v3, s5
4073 ; CI-NEXT: flat_load_dword v4, v[0:1]
4074 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
4075 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
4076 ; CI-NEXT: flat_load_dword v3, v[0:1]
4077 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
4078 ; CI-NEXT: v_mov_b32_e32 v1, s1
4079 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4080 ; CI-NEXT: s_waitcnt vmcnt(1)
4081 ; CI-NEXT: v_bfe_i32 v2, v4, 0, 16
4082 ; CI-NEXT: v_ashrrev_i32_e32 v4, 16, v4
4083 ; CI-NEXT: s_waitcnt vmcnt(0)
4084 ; CI-NEXT: v_bfe_i32 v5, v3, 0, 16
4085 ; CI-NEXT: v_ashrrev_i32_e32 v3, 16, v3
4086 ; CI-NEXT: v_min_i32_e32 v3, v4, v3
4087 ; CI-NEXT: v_min_i32_e32 v2, v2, v5
4088 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
4089 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
4090 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
4091 ; CI-NEXT: flat_store_dword v[0:1], v2
4094 ; VI-LABEL: v_test_imin_sle_v2i16:
4096 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
4097 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
4098 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
4099 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4100 ; VI-NEXT: v_mov_b32_e32 v1, s3
4101 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
4102 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4103 ; VI-NEXT: v_mov_b32_e32 v3, s5
4104 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
4105 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
4106 ; VI-NEXT: flat_load_dword v5, v[0:1]
4107 ; VI-NEXT: flat_load_dword v2, v[2:3]
4108 ; VI-NEXT: v_mov_b32_e32 v1, s1
4109 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
4110 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4111 ; VI-NEXT: s_waitcnt vmcnt(0)
4112 ; VI-NEXT: v_min_i16_e32 v3, v5, v2
4113 ; VI-NEXT: v_min_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4114 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
4115 ; VI-NEXT: flat_store_dword v[0:1], v2
4118 ; GFX9-LABEL: v_test_imin_sle_v2i16:
4120 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
4121 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
4122 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4123 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4124 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
4125 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
4126 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4127 ; GFX9-NEXT: v_pk_min_i16 v1, v1, v2
4128 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
4129 ; GFX9-NEXT: s_endpgm
4131 ; GFX10-LABEL: v_test_imin_sle_v2i16:
4133 ; GFX10-NEXT: s_clause 0x1
4134 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
4135 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
4136 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4137 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4138 ; GFX10-NEXT: s_clause 0x1
4139 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
4140 ; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
4141 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4142 ; GFX10-NEXT: v_pk_min_i16 v1, v1, v2
4143 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
4144 ; GFX10-NEXT: s_endpgm
4146 ; GFX11-LABEL: v_test_imin_sle_v2i16:
4148 ; GFX11-NEXT: s_clause 0x1
4149 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
4150 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
4151 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
4152 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4153 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4154 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4155 ; GFX11-NEXT: s_clause 0x1
4156 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
4157 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1]
4158 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4159 ; GFX11-NEXT: v_pk_min_i16 v1, v1, v2
4160 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
4161 ; GFX11-NEXT: s_nop 0
4162 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4163 ; GFX11-NEXT: s_endpgm
; All three pointers are indexed by the workitem id in <2 x i16> units; the
; min itself is a vector compare (sle) + select.
4164 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4165 %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
4166 %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
4167 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
4168 %a = load <2 x i16>, ptr addrspace(1) %a.gep
4169 %b = load <2 x i16>, ptr addrspace(1) %b.gep
4170 %cmp = icmp sle <2 x i16> %a, %b
4171 %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
4172 store <2 x i16> %val, ptr addrspace(1) %out.gep
; Unsigned minimum of two <2 x i16> vectors, expressed as icmp ule + select.
; Each work-item loads one <2 x i16> from %a.ptr and one from %b.ptr at the
; index given by llvm.amdgcn.workitem.id.x and stores the selected lanes to
; %out at that same index.
;
; Despite "imin" in the function name the comparison is unsigned (ule), and
; the expected output for every check prefix uses an unsigned-min opcode:
;   * EG prefix    - halves split with LSHR 16 / AND_INT 65535, two MIN_UINT,
;                    repacked with LSHL 16 + OR_INT.
;   * CI prefix    - halves split with v_lshrrev_b32 16 / v_and_b32 0xffff,
;                    two v_min_u32, repacked with v_lshlrev_b32 16 + v_or_b32.
;   * VI prefix    - v_min_u16 for the low half, v_min_u16_sdwa with WORD_1
;                    selects for the high half, merged with v_or_b32.
;   * GFX9, GFX10 and GFX11 prefixes - one v_pk_min_u16.
;
; The assertion lines below are autogenerated (see the NOTE on the first line
; of this file); regenerate them with the update script instead of editing
; them by hand.
4178 define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
4179 ; EG-LABEL: v_test_imin_ule_v2i16:
4181 ; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
4183 ; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
4184 ; EG-NEXT: TEX 0 @10
4185 ; EG-NEXT: ALU 13, @16, KC0[CB0:0-32], KC1[]
4186 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
4189 ; EG-NEXT: Fetch clause starting at 8:
4190 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
4191 ; EG-NEXT: Fetch clause starting at 10:
4192 ; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
4193 ; EG-NEXT: ALU clause starting at 12:
4194 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
4195 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4196 ; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W,
4197 ; EG-NEXT: ALU clause starting at 15:
4198 ; EG-NEXT: ADD_INT * T7.X, KC0[2].Z, T0.W,
4199 ; EG-NEXT: ALU clause starting at 16:
4200 ; EG-NEXT: LSHR T1.W, T0.X, literal.x,
4201 ; EG-NEXT: LSHR * T2.W, T7.X, literal.x,
4202 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4203 ; EG-NEXT: AND_INT T0.Z, T0.X, literal.x,
4204 ; EG-NEXT: AND_INT T3.W, T7.X, literal.x, BS:VEC_120/SCL_212
4205 ; EG-NEXT: MIN_UINT * T1.W, PS, PV.W,
4206 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
4207 ; EG-NEXT: LSHL T1.W, PS, literal.x,
4208 ; EG-NEXT: MIN_UINT * T2.W, PV.W, PV.Z,
4209 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4210 ; EG-NEXT: OR_INT T0.X, PS, PV.W,
4211 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
4212 ; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
4213 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4215 ; CI-LABEL: v_test_imin_ule_v2i16:
4217 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
4218 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4
4219 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
4220 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4221 ; CI-NEXT: v_mov_b32_e32 v1, s3
4222 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
4223 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4224 ; CI-NEXT: v_mov_b32_e32 v3, s5
4225 ; CI-NEXT: flat_load_dword v4, v[0:1]
4226 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
4227 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
4228 ; CI-NEXT: flat_load_dword v3, v[0:1]
4229 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
4230 ; CI-NEXT: v_mov_b32_e32 v1, s1
4231 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4232 ; CI-NEXT: s_waitcnt vmcnt(1)
4233 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
4234 ; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4
4235 ; CI-NEXT: s_waitcnt vmcnt(0)
4236 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
4237 ; CI-NEXT: v_and_b32_e32 v3, 0xffff, v3
4238 ; CI-NEXT: v_min_u32_e32 v2, v2, v5
4239 ; CI-NEXT: v_min_u32_e32 v3, v4, v3
4240 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
4241 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
4242 ; CI-NEXT: flat_store_dword v[0:1], v2
4245 ; VI-LABEL: v_test_imin_ule_v2i16:
4247 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
4248 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
4249 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
4250 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4251 ; VI-NEXT: v_mov_b32_e32 v1, s3
4252 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
4253 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4254 ; VI-NEXT: v_mov_b32_e32 v3, s5
4255 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
4256 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
4257 ; VI-NEXT: flat_load_dword v5, v[0:1]
4258 ; VI-NEXT: flat_load_dword v2, v[2:3]
4259 ; VI-NEXT: v_mov_b32_e32 v1, s1
4260 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
4261 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4262 ; VI-NEXT: s_waitcnt vmcnt(0)
4263 ; VI-NEXT: v_min_u16_e32 v3, v5, v2
4264 ; VI-NEXT: v_min_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4265 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
4266 ; VI-NEXT: flat_store_dword v[0:1], v2
4269 ; GFX9-LABEL: v_test_imin_ule_v2i16:
4271 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
4272 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
4273 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4274 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4275 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
4276 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
4277 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4278 ; GFX9-NEXT: v_pk_min_u16 v1, v1, v2
4279 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
4280 ; GFX9-NEXT: s_endpgm
4282 ; GFX10-LABEL: v_test_imin_ule_v2i16:
4284 ; GFX10-NEXT: s_clause 0x1
4285 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
4286 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
4287 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4288 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4289 ; GFX10-NEXT: s_clause 0x1
4290 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
4291 ; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
4292 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4293 ; GFX10-NEXT: v_pk_min_u16 v1, v1, v2
4294 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
4295 ; GFX10-NEXT: s_endpgm
4297 ; GFX11-LABEL: v_test_imin_ule_v2i16:
4299 ; GFX11-NEXT: s_clause 0x1
4300 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
4301 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10
4302 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
4303 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4304 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4305 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4306 ; GFX11-NEXT: s_clause 0x1
4307 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
4308 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1]
4309 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4310 ; GFX11-NEXT: v_pk_min_u16 v1, v1, v2
4311 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
4312 ; GFX11-NEXT: s_nop 0
4313 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4314 ; GFX11-NEXT: s_endpgm
; One <2 x i16> element per work-item: the same %tid index is applied to the
; two input pointers and to the output pointer.
4315 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4316 %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
4317 %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
4318 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
4319 %a = load <2 x i16>, ptr addrspace(1) %a.gep
4320 %b = load <2 x i16>, ptr addrspace(1) %b.gep
; Pattern under test: picking %a where %a <= %b (unsigned) and %b otherwise
; is a per-lane unsigned min.
4321 %cmp = icmp ule <2 x i16> %a, %b
4322 %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
4323 store <2 x i16> %val, ptr addrspace(1) %out.gep
; Declaration of the intrinsic the test kernels call to obtain %tid.
4327 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute groups: #0 is attached to the kernel definitions, #1 to the
; intrinsic declaration above.
4329 attributes #0 = { nounwind }
4330 attributes #1 = { nounwind readnone }