1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2 ; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck --check-prefix=EG %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; Per-workitem signed min on i32: loads a[tid] and b[tid], picks %a when
; %a <= %b (signed) and %b otherwise, and stores the result to out[tid].
; The checks below expect the icmp sle + select pair to be emitted as one
; signed-min instruction on every target (MIN_INT on r600, v_min_i32 on amdgcn).
9 define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
10 ; EG-LABEL: v_test_imin_sle_i32:
12 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
14 ; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
15 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
18 ; EG-NEXT: Fetch clause starting at 6:
19 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
20 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
21 ; EG-NEXT: ALU clause starting at 10:
22 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
23 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
24 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
25 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
26 ; EG-NEXT: ALU clause starting at 14:
27 ; EG-NEXT: MIN_INT T0.X, T0.X, T1.X,
28 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
29 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
30 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
32 ; CI-LABEL: v_test_imin_sle_i32:
34 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
35 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
36 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
37 ; CI-NEXT: s_waitcnt lgkmcnt(0)
38 ; CI-NEXT: v_mov_b32_e32 v1, s3
39 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
40 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
41 ; CI-NEXT: v_mov_b32_e32 v3, s5
42 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
43 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
44 ; CI-NEXT: flat_load_dword v5, v[0:1]
45 ; CI-NEXT: flat_load_dword v2, v[2:3]
46 ; CI-NEXT: v_mov_b32_e32 v1, s1
47 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
48 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
49 ; CI-NEXT: s_waitcnt vmcnt(0)
50 ; CI-NEXT: v_min_i32_e32 v2, v5, v2
51 ; CI-NEXT: flat_store_dword v[0:1], v2
54 ; VI-LABEL: v_test_imin_sle_i32:
56 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
57 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
58 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
59 ; VI-NEXT: s_waitcnt lgkmcnt(0)
60 ; VI-NEXT: v_mov_b32_e32 v1, s3
61 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
62 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
63 ; VI-NEXT: v_mov_b32_e32 v3, s5
64 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
65 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
66 ; VI-NEXT: flat_load_dword v5, v[0:1]
67 ; VI-NEXT: flat_load_dword v2, v[2:3]
68 ; VI-NEXT: v_mov_b32_e32 v1, s1
69 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
70 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
71 ; VI-NEXT: s_waitcnt vmcnt(0)
72 ; VI-NEXT: v_min_i32_e32 v2, v5, v2
73 ; VI-NEXT: flat_store_dword v[0:1], v2
76 ; GFX9-LABEL: v_test_imin_sle_i32:
78 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
79 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
80 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
81 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
82 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
83 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
84 ; GFX9-NEXT: s_waitcnt vmcnt(0)
85 ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2
86 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
89 ; GFX10-LABEL: v_test_imin_sle_i32:
91 ; GFX10-NEXT: s_clause 0x1
92 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
93 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
94 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
95 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX10-NEXT: s_clause 0x1
97 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
98 ; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
99 ; GFX10-NEXT: s_waitcnt vmcnt(0)
100 ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2
101 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
102 ; GFX10-NEXT: s_endpgm
104 ; GFX11-LABEL: v_test_imin_sle_i32:
106 ; GFX11-NEXT: s_clause 0x1
107 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
108 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
109 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
110 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
111 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
112 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
113 ; GFX11-NEXT: s_clause 0x1
114 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
115 ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
116 ; GFX11-NEXT: s_waitcnt vmcnt(0)
117 ; GFX11-NEXT: v_min_i32_e32 v1, v1, v2
118 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
119 ; GFX11-NEXT: s_endpgm
; IR under test: all three pointers are indexed by the workitem id, and the
; min is spelled as icmp sle + select rather than as a min intrinsic.
120 %tid = call i32 @llvm.amdgcn.workitem.id.x()
121 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
122 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
123 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
124 %a = load i32, ptr addrspace(1) %a.gep, align 4
125 %b = load i32, ptr addrspace(1) %b.gep, align 4
126 %cmp = icmp sle i32 %a, %b
127 %val = select i1 %cmp, i32 %a, i32 %b
128 store i32 %val, ptr addrspace(1) %out.gep, align 4
; Signed min of two i32 kernel arguments: picks %a when %a <= %b (signed),
; otherwise %b, and stores the result to %out.
; The checks expect MIN_INT on r600 and a scalar s_min_i32 on every amdgcn
; target, since both operands come straight from kernel arguments.
132 define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
133 ; EG-LABEL: s_test_imin_sle_i32:
135 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
136 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
139 ; EG-NEXT: ALU clause starting at 4:
140 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
141 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
142 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
144 ; CI-LABEL: s_test_imin_sle_i32:
146 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
147 ; CI-NEXT: s_waitcnt lgkmcnt(0)
148 ; CI-NEXT: s_min_i32 s2, s2, s3
149 ; CI-NEXT: v_mov_b32_e32 v0, s0
150 ; CI-NEXT: v_mov_b32_e32 v1, s1
151 ; CI-NEXT: v_mov_b32_e32 v2, s2
152 ; CI-NEXT: flat_store_dword v[0:1], v2
155 ; VI-LABEL: s_test_imin_sle_i32:
157 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
158 ; VI-NEXT: s_waitcnt lgkmcnt(0)
159 ; VI-NEXT: s_min_i32 s2, s2, s3
160 ; VI-NEXT: v_mov_b32_e32 v0, s0
161 ; VI-NEXT: v_mov_b32_e32 v1, s1
162 ; VI-NEXT: v_mov_b32_e32 v2, s2
163 ; VI-NEXT: flat_store_dword v[0:1], v2
166 ; GFX9-LABEL: s_test_imin_sle_i32:
168 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
169 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
170 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
171 ; GFX9-NEXT: s_min_i32 s2, s2, s3
172 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
173 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
174 ; GFX9-NEXT: s_endpgm
176 ; GFX10-LABEL: s_test_imin_sle_i32:
178 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
179 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
180 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
181 ; GFX10-NEXT: s_min_i32 s2, s2, s3
182 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
183 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
184 ; GFX10-NEXT: s_endpgm
186 ; GFX11-LABEL: s_test_imin_sle_i32:
188 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
189 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
190 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
191 ; GFX11-NEXT: s_min_i32 s2, s2, s3
192 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
193 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
194 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
195 ; GFX11-NEXT: s_endpgm
; IR under test: min written as icmp sle + select on the two scalar arguments.
196 %cmp = icmp sle i32 %a, %b
197 %val = select i1 %cmp, i32 %a, i32 %b
198 store i32 %val, ptr addrspace(1) %out, align 4
; Signed min of two <1 x i32> kernel arguments, written as a vector
; icmp sle + select; the result is stored to %out.
; The checks expect exactly one min instruction per target (MIN_INT on r600,
; s_min_i32 on amdgcn), i.e. no extra code for the one-element vector type.
202 define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
203 ; EG-LABEL: s_test_imin_sle_v1i32:
205 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
206 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
209 ; EG-NEXT: ALU clause starting at 4:
210 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
211 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
212 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
214 ; CI-LABEL: s_test_imin_sle_v1i32:
216 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
217 ; CI-NEXT: s_waitcnt lgkmcnt(0)
218 ; CI-NEXT: s_min_i32 s2, s2, s3
219 ; CI-NEXT: v_mov_b32_e32 v0, s0
220 ; CI-NEXT: v_mov_b32_e32 v1, s1
221 ; CI-NEXT: v_mov_b32_e32 v2, s2
222 ; CI-NEXT: flat_store_dword v[0:1], v2
225 ; VI-LABEL: s_test_imin_sle_v1i32:
227 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
228 ; VI-NEXT: s_waitcnt lgkmcnt(0)
229 ; VI-NEXT: s_min_i32 s2, s2, s3
230 ; VI-NEXT: v_mov_b32_e32 v0, s0
231 ; VI-NEXT: v_mov_b32_e32 v1, s1
232 ; VI-NEXT: v_mov_b32_e32 v2, s2
233 ; VI-NEXT: flat_store_dword v[0:1], v2
236 ; GFX9-LABEL: s_test_imin_sle_v1i32:
238 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
239 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
240 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
241 ; GFX9-NEXT: s_min_i32 s2, s2, s3
242 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
243 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
244 ; GFX9-NEXT: s_endpgm
246 ; GFX10-LABEL: s_test_imin_sle_v1i32:
248 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
249 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
250 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
251 ; GFX10-NEXT: s_min_i32 s2, s2, s3
252 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
253 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
254 ; GFX10-NEXT: s_endpgm
256 ; GFX11-LABEL: s_test_imin_sle_v1i32:
258 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
259 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
260 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
261 ; GFX11-NEXT: s_min_i32 s2, s2, s3
262 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
263 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
264 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
265 ; GFX11-NEXT: s_endpgm
; IR under test: element-wise icmp sle + select on the <1 x i32> arguments.
266 %cmp = icmp sle <1 x i32> %a, %b
267 %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
268 store <1 x i32> %val, ptr addrspace(1) %out
; Element-wise signed min of two <4 x i32> kernel arguments, written as a
; vector icmp sle + select; the four results are stored to %out.
; The checks expect one min per element (four MIN_INT on r600, four s_min_i32
; on amdgcn) followed by a single 128-bit store of the result vector.
272 define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b) #0 {
273 ; EG-LABEL: s_test_imin_sle_v4i32:
275 ; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
276 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
279 ; EG-NEXT: ALU clause starting at 4:
280 ; EG-NEXT: MIN_INT * T0.W, KC0[4].X, KC0[5].X,
281 ; EG-NEXT: MIN_INT * T0.Z, KC0[3].W, KC0[4].W,
282 ; EG-NEXT: MIN_INT * T0.Y, KC0[3].Z, KC0[4].Z,
283 ; EG-NEXT: MIN_INT * T0.X, KC0[3].Y, KC0[4].Y,
284 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
285 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
287 ; CI-LABEL: s_test_imin_sle_v4i32:
289 ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
290 ; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
291 ; CI-NEXT: s_waitcnt lgkmcnt(0)
292 ; CI-NEXT: s_min_i32 s3, s3, s7
293 ; CI-NEXT: s_min_i32 s2, s2, s6
294 ; CI-NEXT: s_min_i32 s1, s1, s5
295 ; CI-NEXT: s_min_i32 s0, s0, s4
296 ; CI-NEXT: v_mov_b32_e32 v4, s8
297 ; CI-NEXT: v_mov_b32_e32 v0, s0
298 ; CI-NEXT: v_mov_b32_e32 v1, s1
299 ; CI-NEXT: v_mov_b32_e32 v2, s2
300 ; CI-NEXT: v_mov_b32_e32 v3, s3
301 ; CI-NEXT: v_mov_b32_e32 v5, s9
302 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
305 ; VI-LABEL: s_test_imin_sle_v4i32:
307 ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
308 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
309 ; VI-NEXT: s_waitcnt lgkmcnt(0)
310 ; VI-NEXT: s_min_i32 s3, s3, s7
311 ; VI-NEXT: s_min_i32 s2, s2, s6
312 ; VI-NEXT: s_min_i32 s1, s1, s5
313 ; VI-NEXT: s_min_i32 s0, s0, s4
314 ; VI-NEXT: v_mov_b32_e32 v4, s8
315 ; VI-NEXT: v_mov_b32_e32 v0, s0
316 ; VI-NEXT: v_mov_b32_e32 v1, s1
317 ; VI-NEXT: v_mov_b32_e32 v2, s2
318 ; VI-NEXT: v_mov_b32_e32 v3, s3
319 ; VI-NEXT: v_mov_b32_e32 v5, s9
320 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
323 ; GFX9-LABEL: s_test_imin_sle_v4i32:
325 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
326 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
327 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
328 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
329 ; GFX9-NEXT: s_min_i32 s3, s3, s7
330 ; GFX9-NEXT: s_min_i32 s2, s2, s6
331 ; GFX9-NEXT: s_min_i32 s1, s1, s5
332 ; GFX9-NEXT: s_min_i32 s0, s0, s4
333 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
334 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
335 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
336 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
337 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
338 ; GFX9-NEXT: s_endpgm
340 ; GFX10-LABEL: s_test_imin_sle_v4i32:
342 ; GFX10-NEXT: s_clause 0x1
343 ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
344 ; GFX10-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
345 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
346 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
347 ; GFX10-NEXT: s_min_i32 s3, s3, s7
348 ; GFX10-NEXT: s_min_i32 s2, s2, s6
349 ; GFX10-NEXT: s_min_i32 s0, s0, s4
350 ; GFX10-NEXT: s_min_i32 s1, s1, s5
351 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
352 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
353 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
354 ; GFX10-NEXT: v_mov_b32_e32 v3, s3
355 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
356 ; GFX10-NEXT: s_endpgm
358 ; GFX11-LABEL: s_test_imin_sle_v4i32:
360 ; GFX11-NEXT: s_clause 0x1
361 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x10
362 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
363 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
364 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
365 ; GFX11-NEXT: s_min_i32 s2, s11, s15
366 ; GFX11-NEXT: s_min_i32 s3, s10, s14
367 ; GFX11-NEXT: s_min_i32 s4, s8, s12
368 ; GFX11-NEXT: s_min_i32 s5, s9, s13
369 ; GFX11-NEXT: v_mov_b32_e32 v0, s4
370 ; GFX11-NEXT: v_mov_b32_e32 v1, s5
371 ; GFX11-NEXT: v_mov_b32_e32 v2, s3
372 ; GFX11-NEXT: v_mov_b32_e32 v3, s2
373 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
374 ; GFX11-NEXT: s_endpgm
; IR under test: element-wise icmp sle + select on the <4 x i32> arguments.
375 %cmp = icmp sle <4 x i32> %a, %b
376 %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
377 store <4 x i32> %val, ptr addrspace(1) %out
; Signed min of two i8 kernel arguments, written as icmp sle + select; the
; one-byte result is stored to %out.
; The unnamed [8 x i32] parameters pad the argument list, so %a and %b are
; fetched by separate loads at different kernarg offsets in the checks below.
; Expected lowering on amdgcn: sign-extend both bytes (s_sext_i32_i8), one
; s_min_i32, then a byte store. On r600 the operands are sign-extended with
; BFE_INT, combined with MIN_INT, and written with a masked MEM_RAT MSKOR.
381 define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 {
382 ; EG-LABEL: s_test_imin_sle_i8:
384 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
386 ; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
387 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
390 ; EG-NEXT: Fetch clause starting at 6:
391 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 72, #3
392 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 108, #3
393 ; EG-NEXT: ALU clause starting at 10:
394 ; EG-NEXT: MOV * T0.X, 0.0,
395 ; EG-NEXT: ALU clause starting at 11:
396 ; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
397 ; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
398 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
399 ; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
400 ; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
401 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
402 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
403 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
404 ; EG-NEXT: LSHL T0.X, PV.W, PS,
405 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
406 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
407 ; EG-NEXT: MOV T0.Y, 0.0,
408 ; EG-NEXT: MOV * T0.Z, 0.0,
409 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
410 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
412 ; CI-LABEL: s_test_imin_sle_i8:
414 ; CI-NEXT: s_load_dword s2, s[8:9], 0xa
415 ; CI-NEXT: s_load_dword s3, s[8:9], 0x13
416 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
417 ; CI-NEXT: s_waitcnt lgkmcnt(0)
418 ; CI-NEXT: s_sext_i32_i8 s2, s2
419 ; CI-NEXT: s_sext_i32_i8 s3, s3
420 ; CI-NEXT: s_min_i32 s2, s2, s3
421 ; CI-NEXT: v_mov_b32_e32 v0, s0
422 ; CI-NEXT: v_mov_b32_e32 v1, s1
423 ; CI-NEXT: v_mov_b32_e32 v2, s2
424 ; CI-NEXT: flat_store_byte v[0:1], v2
427 ; VI-LABEL: s_test_imin_sle_i8:
429 ; VI-NEXT: s_load_dword s2, s[8:9], 0x28
430 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
431 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
432 ; VI-NEXT: s_waitcnt lgkmcnt(0)
433 ; VI-NEXT: s_sext_i32_i8 s2, s2
434 ; VI-NEXT: s_sext_i32_i8 s3, s3
435 ; VI-NEXT: s_min_i32 s2, s2, s3
436 ; VI-NEXT: v_mov_b32_e32 v0, s0
437 ; VI-NEXT: v_mov_b32_e32 v1, s1
438 ; VI-NEXT: v_mov_b32_e32 v2, s2
439 ; VI-NEXT: flat_store_byte v[0:1], v2
442 ; GFX9-LABEL: s_test_imin_sle_i8:
444 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28
445 ; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c
446 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
447 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
448 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
449 ; GFX9-NEXT: s_sext_i32_i8 s2, s2
450 ; GFX9-NEXT: s_sext_i32_i8 s3, s3
451 ; GFX9-NEXT: s_min_i32 s2, s2, s3
452 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
453 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
454 ; GFX9-NEXT: s_endpgm
456 ; GFX10-LABEL: s_test_imin_sle_i8:
458 ; GFX10-NEXT: s_clause 0x2
459 ; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28
460 ; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c
461 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
462 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
463 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
464 ; GFX10-NEXT: s_sext_i32_i8 s2, s2
465 ; GFX10-NEXT: s_sext_i32_i8 s3, s3
466 ; GFX10-NEXT: s_min_i32 s2, s2, s3
467 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
468 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
469 ; GFX10-NEXT: s_endpgm
471 ; GFX11-LABEL: s_test_imin_sle_i8:
473 ; GFX11-NEXT: s_clause 0x2
474 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28
475 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c
476 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
477 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
478 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
479 ; GFX11-NEXT: s_sext_i32_i8 s2, s2
480 ; GFX11-NEXT: s_sext_i32_i8 s3, s3
481 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
482 ; GFX11-NEXT: s_min_i32 s2, s2, s3
483 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
484 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
485 ; GFX11-NEXT: s_endpgm
; IR under test: min written as icmp sle + select on the two i8 arguments.
486 %cmp = icmp sle i8 %a, %b
487 %val = select i1 %cmp, i8 %a, i8 %b
488 store i8 %val, ptr addrspace(1) %out
492 ; FIXME: Why are vector (VALU) and SDWA instructions used for the last element?
; Element-wise signed min of two <4 x i8> kernel arguments, written as a
; vector icmp sle + select; the packed four-byte result is stored to %out.
; The unnamed [8 x i32] parameters pad the argument list, so each packed
; operand is fetched by its own dword load in the amdgcn checks.
; Expected lowering differs by target:
;  - r600: eight byte reads, BFE_INT sign extension, four MIN_INT, then the
;    bytes are re-packed with AND/LSHL/OR.
;  - kaveri and tonga: each byte is sign-extended on the scalar unit
;    (s_sext_i32_i8 / s_bfe_i32 / s_ashr_i32), four s_min_i32 are emitted,
;    and the result is re-packed with scalar shifts, ANDs and ORs.
;  - gfx900, gfx1010, gfx1100: four v_min_i16 on the vector unit; gfx900 and
;    gfx1010 re-pack with v_or_b32_sdwa, gfx1100 with v_and/v_lshlrev/v_or.
494 define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 {
495 ; EG-LABEL: s_test_imin_sle_v4i8:
497 ; EG-NEXT: ALU 0, @22, KC0[], KC1[]
499 ; EG-NEXT: ALU 30, @23, KC0[CB0:0-32], KC1[]
500 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
503 ; EG-NEXT: Fetch clause starting at 6:
504 ; EG-NEXT: VTX_READ_8 T5.X, T4.X, 74, #3
505 ; EG-NEXT: VTX_READ_8 T6.X, T4.X, 108, #3
506 ; EG-NEXT: VTX_READ_8 T7.X, T4.X, 72, #3
507 ; EG-NEXT: VTX_READ_8 T8.X, T4.X, 111, #3
508 ; EG-NEXT: VTX_READ_8 T9.X, T4.X, 75, #3
509 ; EG-NEXT: VTX_READ_8 T10.X, T4.X, 109, #3
510 ; EG-NEXT: VTX_READ_8 T11.X, T4.X, 73, #3
511 ; EG-NEXT: VTX_READ_8 T4.X, T4.X, 110, #3
512 ; EG-NEXT: ALU clause starting at 22:
513 ; EG-NEXT: MOV * T4.X, 0.0,
514 ; EG-NEXT: ALU clause starting at 23:
515 ; EG-NEXT: BFE_INT T0.Z, T5.X, 0.0, literal.x,
516 ; EG-NEXT: BFE_INT * T0.W, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
517 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
518 ; EG-NEXT: BFE_INT T4.X, T11.X, 0.0, literal.x,
519 ; EG-NEXT: BFE_INT T0.Y, T10.X, 0.0, literal.x, BS:VEC_120/SCL_212
520 ; EG-NEXT: BFE_INT * T1.Z, T9.X, 0.0, literal.x, BS:VEC_201
521 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
522 ; EG-NEXT: BFE_INT T1.W, T8.X, 0.0, literal.x,
523 ; EG-NEXT: MIN_INT * T0.W, T0.Z, T0.W,
524 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
525 ; EG-NEXT: MIN_INT T0.Z, T1.Z, PV.W,
526 ; EG-NEXT: AND_INT T0.W, PS, literal.x,
527 ; EG-NEXT: MIN_INT * T1.W, T4.X, T0.Y,
528 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
529 ; EG-NEXT: AND_INT T4.X, PS, literal.x,
530 ; EG-NEXT: LSHL T0.Y, PV.W, literal.y,
531 ; EG-NEXT: BFE_INT T1.Z, T7.X, 0.0, literal.z,
532 ; EG-NEXT: BFE_INT T0.W, T6.X, 0.0, literal.z, BS:VEC_120/SCL_212
533 ; EG-NEXT: LSHL * T1.W, PV.Z, literal.w,
534 ; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
535 ; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
536 ; EG-NEXT: MIN_INT T0.Z, PV.Z, PV.W,
537 ; EG-NEXT: OR_INT T0.W, PS, PV.Y,
538 ; EG-NEXT: LSHL * T1.W, PV.X, literal.x,
539 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
540 ; EG-NEXT: OR_INT T0.W, PV.W, PS,
541 ; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x,
542 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
543 ; EG-NEXT: OR_INT T4.X, PV.W, PS,
544 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
545 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
547 ; CI-LABEL: s_test_imin_sle_v4i8:
549 ; CI-NEXT: s_load_dword s2, s[8:9], 0xa
550 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
551 ; CI-NEXT: s_load_dword s3, s[8:9], 0x13
552 ; CI-NEXT: s_waitcnt lgkmcnt(0)
553 ; CI-NEXT: s_ashr_i32 s4, s2, 24
554 ; CI-NEXT: s_sext_i32_i8 s5, s2
555 ; CI-NEXT: s_bfe_i32 s6, s2, 0x80008
556 ; CI-NEXT: s_bfe_i32 s2, s2, 0x80010
557 ; CI-NEXT: s_ashr_i32 s7, s3, 24
558 ; CI-NEXT: s_sext_i32_i8 s8, s3
559 ; CI-NEXT: s_bfe_i32 s9, s3, 0x80008
560 ; CI-NEXT: s_bfe_i32 s3, s3, 0x80010
561 ; CI-NEXT: s_min_i32 s2, s2, s3
562 ; CI-NEXT: s_min_i32 s4, s4, s7
563 ; CI-NEXT: s_and_b32 s2, s2, 0xff
564 ; CI-NEXT: s_lshl_b32 s4, s4, 24
565 ; CI-NEXT: s_lshl_b32 s2, s2, 16
566 ; CI-NEXT: s_or_b32 s2, s4, s2
567 ; CI-NEXT: s_min_i32 s3, s6, s9
568 ; CI-NEXT: s_min_i32 s4, s5, s8
569 ; CI-NEXT: s_lshl_b32 s3, s3, 8
570 ; CI-NEXT: s_and_b32 s4, s4, 0xff
571 ; CI-NEXT: s_or_b32 s3, s4, s3
572 ; CI-NEXT: s_and_b32 s3, s3, 0xffff
573 ; CI-NEXT: s_or_b32 s2, s3, s2
574 ; CI-NEXT: v_mov_b32_e32 v0, s0
575 ; CI-NEXT: v_mov_b32_e32 v1, s1
576 ; CI-NEXT: v_mov_b32_e32 v2, s2
577 ; CI-NEXT: flat_store_dword v[0:1], v2
580 ; VI-LABEL: s_test_imin_sle_v4i8:
582 ; VI-NEXT: s_load_dword s2, s[8:9], 0x28
583 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
584 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
585 ; VI-NEXT: s_waitcnt lgkmcnt(0)
586 ; VI-NEXT: s_ashr_i32 s4, s2, 24
587 ; VI-NEXT: s_bfe_i32 s5, s2, 0x80010
588 ; VI-NEXT: s_bfe_i32 s6, s2, 0x80008
589 ; VI-NEXT: s_sext_i32_i8 s2, s2
590 ; VI-NEXT: s_ashr_i32 s7, s3, 24
591 ; VI-NEXT: s_bfe_i32 s8, s3, 0x80010
592 ; VI-NEXT: s_bfe_i32 s9, s3, 0x80008
593 ; VI-NEXT: s_sext_i32_i8 s3, s3
594 ; VI-NEXT: s_min_i32 s2, s2, s3
595 ; VI-NEXT: s_min_i32 s3, s6, s9
596 ; VI-NEXT: s_min_i32 s5, s5, s8
597 ; VI-NEXT: s_min_i32 s4, s4, s7
598 ; VI-NEXT: s_and_b32 s5, s5, 0xff
599 ; VI-NEXT: s_lshl_b32 s3, s3, 8
600 ; VI-NEXT: s_and_b32 s2, s2, 0xff
601 ; VI-NEXT: s_lshl_b32 s4, s4, 24
602 ; VI-NEXT: s_lshl_b32 s5, s5, 16
603 ; VI-NEXT: s_or_b32 s2, s2, s3
604 ; VI-NEXT: s_or_b32 s4, s4, s5
605 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
606 ; VI-NEXT: s_or_b32 s2, s2, s4
607 ; VI-NEXT: v_mov_b32_e32 v0, s0
608 ; VI-NEXT: v_mov_b32_e32 v1, s1
609 ; VI-NEXT: v_mov_b32_e32 v2, s2
610 ; VI-NEXT: flat_store_dword v[0:1], v2
613 ; GFX9-LABEL: s_test_imin_sle_v4i8:
615 ; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c
616 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28
617 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
618 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
619 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
620 ; GFX9-NEXT: s_lshr_b32 s5, s2, 16
621 ; GFX9-NEXT: s_lshr_b32 s8, s3, 16
622 ; GFX9-NEXT: s_ashr_i32 s9, s3, 24
623 ; GFX9-NEXT: s_ashr_i32 s6, s2, 24
624 ; GFX9-NEXT: s_bfe_i32 s8, s8, 0x80000
625 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
626 ; GFX9-NEXT: s_bfe_i32 s5, s5, 0x80000
627 ; GFX9-NEXT: s_sext_i32_i16 s7, s3
628 ; GFX9-NEXT: v_min_i16_e32 v1, s6, v1
629 ; GFX9-NEXT: v_mov_b32_e32 v2, s8
630 ; GFX9-NEXT: s_sext_i32_i16 s4, s2
631 ; GFX9-NEXT: s_lshr_b32 s7, s7, 8
632 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
633 ; GFX9-NEXT: v_min_i16_e32 v2, s5, v2
634 ; GFX9-NEXT: s_lshr_b32 s4, s4, 8
635 ; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80000
636 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
637 ; GFX9-NEXT: v_mov_b32_e32 v2, s7
638 ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80000
639 ; GFX9-NEXT: v_min_i16_e32 v2, s4, v2
640 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
641 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
642 ; GFX9-NEXT: v_min_i16_e32 v3, s2, v3
643 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
644 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
645 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
646 ; GFX9-NEXT: s_endpgm
648 ; GFX10-LABEL: s_test_imin_sle_v4i8:
650 ; GFX10-NEXT: s_clause 0x2
651 ; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28
652 ; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c
653 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
654 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
655 ; GFX10-NEXT: s_sext_i32_i16 s4, s2
656 ; GFX10-NEXT: s_sext_i32_i16 s7, s3
657 ; GFX10-NEXT: s_ashr_i32 s6, s2, 24
658 ; GFX10-NEXT: s_ashr_i32 s9, s3, 24
659 ; GFX10-NEXT: s_lshr_b32 s4, s4, 8
660 ; GFX10-NEXT: s_lshr_b32 s7, s7, 8
661 ; GFX10-NEXT: v_min_i16 v0, s6, s9
662 ; GFX10-NEXT: v_min_i16 v1, s4, s7
663 ; GFX10-NEXT: s_lshr_b32 s5, s2, 16
664 ; GFX10-NEXT: s_lshr_b32 s8, s3, 16
665 ; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80000
666 ; GFX10-NEXT: s_bfe_i32 s5, s5, 0x80000
667 ; GFX10-NEXT: s_bfe_i32 s4, s8, 0x80000
668 ; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80000
669 ; GFX10-NEXT: v_min_i16 v2, s5, s4
670 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
671 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
672 ; GFX10-NEXT: v_min_i16 v3, s2, s3
673 ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
674 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
675 ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
676 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
677 ; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
678 ; GFX10-NEXT: s_endpgm
680 ; GFX11-LABEL: s_test_imin_sle_v4i8:
682 ; GFX11-NEXT: s_clause 0x1
683 ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x28
684 ; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x4c
685 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
686 ; GFX11-NEXT: s_sext_i32_i16 s2, s0
687 ; GFX11-NEXT: s_lshr_b32 s3, s0, 16
688 ; GFX11-NEXT: s_sext_i32_i16 s7, s1
689 ; GFX11-NEXT: s_lshr_b32 s8, s1, 16
690 ; GFX11-NEXT: s_ashr_i32 s6, s0, 24
691 ; GFX11-NEXT: s_bfe_i32 s0, s0, 0x80000
692 ; GFX11-NEXT: s_ashr_i32 s9, s1, 24
693 ; GFX11-NEXT: s_bfe_i32 s1, s1, 0x80000
694 ; GFX11-NEXT: s_lshr_b32 s2, s2, 8
695 ; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80000
696 ; GFX11-NEXT: s_lshr_b32 s7, s7, 8
697 ; GFX11-NEXT: s_bfe_i32 s8, s8, 0x80000
698 ; GFX11-NEXT: v_min_i16 v0, s6, s9
699 ; GFX11-NEXT: v_min_i16 v1, s0, s1
700 ; GFX11-NEXT: v_min_i16 v2, s3, s8
701 ; GFX11-NEXT: v_min_i16 v3, s2, s7
702 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
703 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0
704 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
705 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
706 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3
707 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
708 ; GFX11-NEXT: v_or_b32_e32 v0, v2, v0
709 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
710 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
711 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
712 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
713 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
714 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
715 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
716 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
717 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
718 ; GFX11-NEXT: s_endpgm
; IR under test: element-wise icmp sle + select on the <4 x i8> arguments.
719 %cmp = icmp sle <4 x i8> %a, %b
720 %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
721 store <4 x i8> %val, ptr addrspace(1) %out
; Element-wise signed min of two <2 x i16> kernel arguments, written as a
; vector icmp sle + select; the packed result is stored to %out as one dword.
; Expected lowering differs by target:
;  - r600: four 16-bit reads, BFE_INT sign extension, two MIN_INT, then the
;    halves are re-packed with LSHL/AND/OR.
;  - kaveri and tonga: each half is sign-extended on the scalar unit
;    (s_ashr_i32 by 16 for the high half, s_sext_i32_i16 for the low half),
;    two s_min_i32 are emitted, and the halves are re-packed with
;    s_lshl_b32 / s_and_b32 / s_or_b32.
;  - gfx900, gfx1010, gfx1100: a single packed v_pk_min_i16.
725 define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #0 {
726 ; EG-LABEL: s_test_imin_sle_v2i16:
728 ; EG-NEXT: ALU 0, @14, KC0[], KC1[]
730 ; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[]
731 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
734 ; EG-NEXT: Fetch clause starting at 6:
735 ; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
736 ; EG-NEXT: VTX_READ_16 T6.X, T4.X, 44, #3
737 ; EG-NEXT: VTX_READ_16 T7.X, T4.X, 40, #3
738 ; EG-NEXT: VTX_READ_16 T4.X, T4.X, 46, #3
739 ; EG-NEXT: ALU clause starting at 14:
740 ; EG-NEXT: MOV * T4.X, 0.0,
741 ; EG-NEXT: ALU clause starting at 15:
742 ; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
743 ; EG-NEXT: BFE_INT T0.Y, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
744 ; EG-NEXT: BFE_INT * T0.Z, T7.X, 0.0, literal.x, BS:VEC_201
745 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
746 ; EG-NEXT: BFE_INT * T0.W, T6.X, 0.0, literal.x,
747 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
748 ; EG-NEXT: MIN_INT T0.W, T0.Z, PV.W,
749 ; EG-NEXT: MIN_INT * T1.W, T5.X, T0.Y,
750 ; EG-NEXT: LSHL T1.W, PS, literal.x,
751 ; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
752 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
753 ; EG-NEXT: OR_INT T4.X, PV.W, PS,
754 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
755 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
757 ; CI-LABEL: s_test_imin_sle_v2i16:
759 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
760 ; CI-NEXT: s_waitcnt lgkmcnt(0)
761 ; CI-NEXT: s_ashr_i32 s4, s2, 16
762 ; CI-NEXT: s_sext_i32_i16 s2, s2
763 ; CI-NEXT: s_ashr_i32 s5, s3, 16
764 ; CI-NEXT: s_sext_i32_i16 s3, s3
765 ; CI-NEXT: s_min_i32 s4, s4, s5
766 ; CI-NEXT: s_min_i32 s2, s2, s3
767 ; CI-NEXT: s_lshl_b32 s3, s4, 16
768 ; CI-NEXT: s_and_b32 s2, s2, 0xffff
769 ; CI-NEXT: s_or_b32 s2, s2, s3
770 ; CI-NEXT: v_mov_b32_e32 v0, s0
771 ; CI-NEXT: v_mov_b32_e32 v1, s1
772 ; CI-NEXT: v_mov_b32_e32 v2, s2
773 ; CI-NEXT: flat_store_dword v[0:1], v2
776 ; VI-LABEL: s_test_imin_sle_v2i16:
778 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
779 ; VI-NEXT: s_waitcnt lgkmcnt(0)
780 ; VI-NEXT: s_ashr_i32 s4, s2, 16
781 ; VI-NEXT: s_sext_i32_i16 s2, s2
782 ; VI-NEXT: s_ashr_i32 s5, s3, 16
783 ; VI-NEXT: s_sext_i32_i16 s3, s3
784 ; VI-NEXT: s_min_i32 s4, s4, s5
785 ; VI-NEXT: s_min_i32 s2, s2, s3
786 ; VI-NEXT: s_lshl_b32 s3, s4, 16
787 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
788 ; VI-NEXT: s_or_b32 s2, s2, s3
789 ; VI-NEXT: v_mov_b32_e32 v0, s0
790 ; VI-NEXT: v_mov_b32_e32 v1, s1
791 ; VI-NEXT: v_mov_b32_e32 v2, s2
792 ; VI-NEXT: flat_store_dword v[0:1], v2
795 ; GFX9-LABEL: s_test_imin_sle_v2i16:
797 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
798 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
799 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
800 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
801 ; GFX9-NEXT: v_pk_min_i16 v1, s2, v1
802 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
803 ; GFX9-NEXT: s_endpgm
805 ; GFX10-LABEL: s_test_imin_sle_v2i16:
807 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
808 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
809 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
810 ; GFX10-NEXT: v_pk_min_i16 v1, s2, s3
811 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
812 ; GFX10-NEXT: s_endpgm
814 ; GFX11-LABEL: s_test_imin_sle_v2i16:
816 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
817 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
818 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
819 ; GFX11-NEXT: v_pk_min_i16 v1, s2, s3
820 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
821 ; GFX11-NEXT: s_endpgm
; IR under test: element-wise icmp sle + select on the <2 x i16> arguments.
822 %cmp = icmp sle <2 x i16> %a, %b
823 %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
824 store <2 x i16> %val, ptr addrspace(1) %out
; s_test_imin_sle_v4i16: both <4 x i16> operands are kernel arguments. The
; icmp sle + select pair at the end of the function picks %a where %a <= %b
; (signed) and %b otherwise, and the result is stored to %out.
; The CI and VI checks expect each 16-bit half to be sign-extended
; (s_ashr_i32 / s_sext_i32_i16), reduced with s_min_i32 and repacked; the
; GFX9, GFX10 and GFX11 checks expect two v_pk_min_i16; the EG checks expect
; BFE_INT followed by MIN_INT for each element.
828 define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16> %a, <4 x i16> %b) #0 {
829 ; EG-LABEL: s_test_imin_sle_v4i16:
831 ; EG-NEXT: ALU 1, @28, KC0[], KC1[]
833 ; EG-NEXT: ALU 9, @30, KC0[], KC1[]
835 ; EG-NEXT: ALU 10, @40, KC0[], KC1[]
837 ; EG-NEXT: ALU 10, @51, KC0[], KC1[]
839 ; EG-NEXT: ALU 11, @62, KC0[CB0:0-32], KC1[]
840 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XY, T5.X, 1
843 ; EG-NEXT: Fetch clause starting at 12:
844 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3
845 ; EG-NEXT: VTX_READ_16 T7.X, T5.X, 58, #3
846 ; EG-NEXT: Fetch clause starting at 16:
847 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3
848 ; EG-NEXT: VTX_READ_16 T7.X, T5.X, 56, #3
849 ; EG-NEXT: Fetch clause starting at 20:
850 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3
851 ; EG-NEXT: VTX_READ_16 T7.X, T5.X, 54, #3
852 ; EG-NEXT: Fetch clause starting at 24:
853 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
854 ; EG-NEXT: VTX_READ_16 T5.X, T5.X, 52, #3
855 ; EG-NEXT: ALU clause starting at 28:
856 ; EG-NEXT: MOV * T0.Y, T3.X,
857 ; EG-NEXT: MOV * T5.X, 0.0,
858 ; EG-NEXT: ALU clause starting at 30:
859 ; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
860 ; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
861 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
862 ; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
863 ; EG-NEXT: LSHL T0.W, PV.W, literal.x,
864 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
865 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
866 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
867 ; EG-NEXT: MOV * T3.X, PV.W,
868 ; EG-NEXT: MOV * T0.Y, PV.X,
869 ; EG-NEXT: ALU clause starting at 40:
870 ; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
871 ; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
872 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
873 ; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W,
874 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
875 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
876 ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
877 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
878 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
879 ; EG-NEXT: MOV T3.X, PV.W,
880 ; EG-NEXT: MOV * T0.Y, T2.X,
881 ; EG-NEXT: ALU clause starting at 51:
882 ; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
883 ; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
884 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
885 ; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W,
886 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
887 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
888 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
889 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
890 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
891 ; EG-NEXT: MOV * T2.X, PV.W,
892 ; EG-NEXT: MOV * T0.Y, PV.X,
893 ; EG-NEXT: ALU clause starting at 62:
894 ; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
895 ; EG-NEXT: BFE_INT * T0.W, T5.X, 0.0, literal.x, BS:VEC_120/SCL_212
896 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
897 ; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
898 ; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
899 ; EG-NEXT: AND_INT T1.W, T0.Y, literal.y,
900 ; EG-NEXT: AND_INT * T0.W, PV.W, literal.z,
901 ; EG-NEXT: 2(2.802597e-45), -65536(nan)
902 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
903 ; EG-NEXT: OR_INT * T6.X, PV.W, PS,
904 ; EG-NEXT: MOV T2.X, PV.X,
905 ; EG-NEXT: MOV * T6.Y, T3.X,
907 ; CI-LABEL: s_test_imin_sle_v4i16:
909 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
910 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
911 ; CI-NEXT: s_waitcnt lgkmcnt(0)
912 ; CI-NEXT: s_ashr_i32 s6, s0, 16
913 ; CI-NEXT: s_ashr_i32 s7, s1, 16
914 ; CI-NEXT: s_sext_i32_i16 s0, s0
915 ; CI-NEXT: s_sext_i32_i16 s1, s1
916 ; CI-NEXT: s_ashr_i32 s8, s2, 16
917 ; CI-NEXT: s_ashr_i32 s9, s3, 16
918 ; CI-NEXT: s_sext_i32_i16 s2, s2
919 ; CI-NEXT: s_sext_i32_i16 s3, s3
920 ; CI-NEXT: s_min_i32 s7, s7, s9
921 ; CI-NEXT: s_min_i32 s1, s1, s3
922 ; CI-NEXT: s_min_i32 s3, s6, s8
923 ; CI-NEXT: s_min_i32 s0, s0, s2
924 ; CI-NEXT: s_lshl_b32 s7, s7, 16
925 ; CI-NEXT: s_and_b32 s1, s1, 0xffff
926 ; CI-NEXT: s_lshl_b32 s3, s3, 16
927 ; CI-NEXT: s_and_b32 s0, s0, 0xffff
928 ; CI-NEXT: s_or_b32 s1, s1, s7
929 ; CI-NEXT: s_or_b32 s0, s0, s3
930 ; CI-NEXT: v_mov_b32_e32 v2, s4
931 ; CI-NEXT: v_mov_b32_e32 v0, s0
932 ; CI-NEXT: v_mov_b32_e32 v1, s1
933 ; CI-NEXT: v_mov_b32_e32 v3, s5
934 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
937 ; VI-LABEL: s_test_imin_sle_v4i16:
939 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
940 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
941 ; VI-NEXT: s_waitcnt lgkmcnt(0)
942 ; VI-NEXT: s_ashr_i32 s6, s1, 16
943 ; VI-NEXT: s_sext_i32_i16 s1, s1
944 ; VI-NEXT: s_ashr_i32 s8, s3, 16
945 ; VI-NEXT: s_sext_i32_i16 s3, s3
946 ; VI-NEXT: s_ashr_i32 s7, s0, 16
947 ; VI-NEXT: s_sext_i32_i16 s0, s0
948 ; VI-NEXT: s_ashr_i32 s9, s2, 16
949 ; VI-NEXT: s_sext_i32_i16 s2, s2
950 ; VI-NEXT: s_min_i32 s6, s6, s8
951 ; VI-NEXT: s_min_i32 s1, s1, s3
952 ; VI-NEXT: s_min_i32 s7, s7, s9
953 ; VI-NEXT: s_min_i32 s0, s0, s2
954 ; VI-NEXT: s_lshl_b32 s2, s6, 16
955 ; VI-NEXT: s_and_b32 s1, s1, 0xffff
956 ; VI-NEXT: s_or_b32 s1, s1, s2
957 ; VI-NEXT: s_lshl_b32 s2, s7, 16
958 ; VI-NEXT: s_and_b32 s0, s0, 0xffff
959 ; VI-NEXT: s_or_b32 s0, s0, s2
960 ; VI-NEXT: v_mov_b32_e32 v2, s4
961 ; VI-NEXT: v_mov_b32_e32 v0, s0
962 ; VI-NEXT: v_mov_b32_e32 v1, s1
963 ; VI-NEXT: v_mov_b32_e32 v3, s5
964 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
967 ; GFX9-LABEL: s_test_imin_sle_v4i16:
969 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
970 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
971 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
972 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
973 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
974 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
975 ; GFX9-NEXT: v_pk_min_i16 v1, s1, v0
976 ; GFX9-NEXT: v_pk_min_i16 v0, s0, v3
977 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
978 ; GFX9-NEXT: s_endpgm
980 ; GFX10-LABEL: s_test_imin_sle_v4i16:
982 ; GFX10-NEXT: s_clause 0x1
983 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
984 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
985 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
986 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
987 ; GFX10-NEXT: v_pk_min_i16 v1, s1, s3
988 ; GFX10-NEXT: v_pk_min_i16 v0, s0, s2
989 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
990 ; GFX10-NEXT: s_endpgm
992 ; GFX11-LABEL: s_test_imin_sle_v4i16:
994 ; GFX11-NEXT: s_clause 0x1
995 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
996 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
997 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
998 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
999 ; GFX11-NEXT: v_pk_min_i16 v1, s1, s3
1000 ; GFX11-NEXT: v_pk_min_i16 v0, s0, s2
1001 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
1002 ; GFX11-NEXT: s_endpgm
; Signed-min idiom under test: keep %a where %a <= %b, otherwise take %b.
1003 %cmp = icmp sle <4 x i16> %a, %b
1004 %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
1005 store <4 x i16> %val, ptr addrspace(1) %out
; v_test_imin_slt_i32: each work-item uses its workitem.id.x to index %aptr,
; %bptr and %out, loads one i32 from each input, and stores the value chosen
; by icmp slt + select (%a when %a < %b signed, else %b).
; The CI/VI/GFX9/GFX10/GFX11 checks expect a single v_min_i32_e32 between the
; two loads and the store; the EG checks expect MIN_INT.
1009 define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
1010 ; EG-LABEL: v_test_imin_slt_i32:
1012 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
1014 ; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
1015 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1018 ; EG-NEXT: Fetch clause starting at 6:
1019 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
1020 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1021 ; EG-NEXT: ALU clause starting at 10:
1022 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1023 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1024 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
1025 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
1026 ; EG-NEXT: ALU clause starting at 14:
1027 ; EG-NEXT: MIN_INT T0.X, T0.X, T1.X,
1028 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1029 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
1030 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1032 ; CI-LABEL: v_test_imin_slt_i32:
1034 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1035 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
1036 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1037 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1038 ; CI-NEXT: v_mov_b32_e32 v1, s3
1039 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
1040 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1041 ; CI-NEXT: v_mov_b32_e32 v3, s5
1042 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
1043 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1044 ; CI-NEXT: flat_load_dword v5, v[0:1]
1045 ; CI-NEXT: flat_load_dword v2, v[2:3]
1046 ; CI-NEXT: v_mov_b32_e32 v1, s1
1047 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
1048 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1049 ; CI-NEXT: s_waitcnt vmcnt(0)
1050 ; CI-NEXT: v_min_i32_e32 v2, v5, v2
1051 ; CI-NEXT: flat_store_dword v[0:1], v2
1054 ; VI-LABEL: v_test_imin_slt_i32:
1056 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1057 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1058 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1059 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1060 ; VI-NEXT: v_mov_b32_e32 v1, s3
1061 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1062 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1063 ; VI-NEXT: v_mov_b32_e32 v3, s5
1064 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1065 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1066 ; VI-NEXT: flat_load_dword v5, v[0:1]
1067 ; VI-NEXT: flat_load_dword v2, v[2:3]
1068 ; VI-NEXT: v_mov_b32_e32 v1, s1
1069 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
1070 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1071 ; VI-NEXT: s_waitcnt vmcnt(0)
1072 ; VI-NEXT: v_min_i32_e32 v2, v5, v2
1073 ; VI-NEXT: flat_store_dword v[0:1], v2
1076 ; GFX9-LABEL: v_test_imin_slt_i32:
1078 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1079 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1080 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1081 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1082 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1083 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
1084 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1085 ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2
1086 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1087 ; GFX9-NEXT: s_endpgm
1089 ; GFX10-LABEL: v_test_imin_slt_i32:
1091 ; GFX10-NEXT: s_clause 0x1
1092 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1093 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1094 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1095 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1096 ; GFX10-NEXT: s_clause 0x1
1097 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1098 ; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
1099 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1100 ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2
1101 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1102 ; GFX10-NEXT: s_endpgm
1104 ; GFX11-LABEL: v_test_imin_slt_i32:
1106 ; GFX11-NEXT: s_clause 0x1
1107 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1108 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
1109 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1110 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1111 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1112 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1113 ; GFX11-NEXT: s_clause 0x1
1114 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1115 ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
1116 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1117 ; GFX11-NEXT: v_min_i32_e32 v1, v1, v2
1118 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1119 ; GFX11-NEXT: s_endpgm
; Index all three buffers by the work-item id, then apply the signed-min
; idiom (icmp slt + select on the same operands) to the two loaded values.
1120 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1121 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
1122 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
1123 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
1124 %a = load i32, ptr addrspace(1) %a.gep, align 4
1125 %b = load i32, ptr addrspace(1) %b.gep, align 4
1126 %cmp = icmp slt i32 %a, %b
1127 %val = select i1 %cmp, i32 %a, i32 %b
1128 store i32 %val, ptr addrspace(1) %out.gep, align 4
; v_test_imin_slt_i16: i16 variant of the per-work-item signed-min test. Each
; work-item loads one i16 from %aptr and %bptr at index workitem.id.x and
; stores the icmp slt + select result to %out at the same index.
; Expected lowering differs per target in the checks below:
;   CI          - flat_load_sshort for both inputs, then v_min_i32_e32
;   VI, GFX9    - ushort loads, then v_min_i16_e32
;   GFX10/GFX11 - ushort/u16 loads, then v_min_i16
;   EG          - BFE_INT on both inputs, MIN_INT, and a MEM_RAT MSKOR store
1132 define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
1133 ; EG-LABEL: v_test_imin_slt_i16:
1135 ; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[]
1137 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
1138 ; EG-NEXT: TEX 0 @10
1139 ; EG-NEXT: ALU 16, @15, KC0[CB0:0-32], KC1[]
1140 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X
1143 ; EG-NEXT: Fetch clause starting at 8:
1144 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1145 ; EG-NEXT: Fetch clause starting at 10:
1146 ; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
1147 ; EG-NEXT: ALU clause starting at 12:
1148 ; EG-NEXT: LSHL * T0.W, T0.X, 1,
1149 ; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W,
1150 ; EG-NEXT: ALU clause starting at 14:
1151 ; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, T0.W,
1152 ; EG-NEXT: ALU clause starting at 15:
1153 ; EG-NEXT: BFE_INT T0.Z, T0.X, 0.0, literal.x,
1154 ; EG-NEXT: BFE_INT T1.W, T1.X, 0.0, literal.x, BS:VEC_120/SCL_212
1155 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1156 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1157 ; EG-NEXT: AND_INT T2.W, PS, literal.x,
1158 ; EG-NEXT: MIN_INT * T1.W, PV.W, PV.Z,
1159 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1160 ; EG-NEXT: AND_INT T1.W, PS, literal.x,
1161 ; EG-NEXT: LSHL * T2.W, PV.W, literal.y,
1162 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1163 ; EG-NEXT: LSHL T1.X, PV.W, PS,
1164 ; EG-NEXT: LSHL * T1.W, literal.x, PS,
1165 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1166 ; EG-NEXT: MOV T1.Y, 0.0,
1167 ; EG-NEXT: MOV * T1.Z, 0.0,
1168 ; EG-NEXT: LSHR * T0.X, T0.W, literal.x,
1169 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1171 ; CI-LABEL: v_test_imin_slt_i16:
1173 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1174 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
1175 ; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
1176 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1177 ; CI-NEXT: v_mov_b32_e32 v1, s3
1178 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
1179 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1180 ; CI-NEXT: v_mov_b32_e32 v3, s5
1181 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
1182 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1183 ; CI-NEXT: flat_load_sshort v5, v[0:1]
1184 ; CI-NEXT: flat_load_sshort v2, v[2:3]
1185 ; CI-NEXT: v_mov_b32_e32 v1, s1
1186 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
1187 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1188 ; CI-NEXT: s_waitcnt vmcnt(0)
1189 ; CI-NEXT: v_min_i32_e32 v2, v5, v2
1190 ; CI-NEXT: flat_store_short v[0:1], v2
1193 ; VI-LABEL: v_test_imin_slt_i16:
1195 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1196 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1197 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
1198 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1199 ; VI-NEXT: v_mov_b32_e32 v1, s3
1200 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1201 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1202 ; VI-NEXT: v_mov_b32_e32 v3, s5
1203 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1204 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1205 ; VI-NEXT: flat_load_ushort v5, v[0:1]
1206 ; VI-NEXT: flat_load_ushort v2, v[2:3]
1207 ; VI-NEXT: v_mov_b32_e32 v1, s1
1208 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
1209 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1210 ; VI-NEXT: s_waitcnt vmcnt(0)
1211 ; VI-NEXT: v_min_i16_e32 v2, v5, v2
1212 ; VI-NEXT: flat_store_short v[0:1], v2
1215 ; GFX9-LABEL: v_test_imin_slt_i16:
1217 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1218 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1219 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1220 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1221 ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
1222 ; GFX9-NEXT: global_load_ushort v2, v0, s[4:5]
1223 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1224 ; GFX9-NEXT: v_min_i16_e32 v1, v1, v2
1225 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
1226 ; GFX9-NEXT: s_endpgm
1228 ; GFX10-LABEL: v_test_imin_slt_i16:
1230 ; GFX10-NEXT: s_clause 0x1
1231 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1232 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1233 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1234 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1235 ; GFX10-NEXT: s_clause 0x1
1236 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
1237 ; GFX10-NEXT: global_load_ushort v2, v0, s[4:5]
1238 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1239 ; GFX10-NEXT: v_min_i16 v1, v1, v2
1240 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
1241 ; GFX10-NEXT: s_endpgm
1243 ; GFX11-LABEL: v_test_imin_slt_i16:
1245 ; GFX11-NEXT: s_clause 0x1
1246 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1247 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
1248 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1249 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1250 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1251 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1252 ; GFX11-NEXT: s_clause 0x1
1253 ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
1254 ; GFX11-NEXT: global_load_u16 v2, v0, s[4:5]
1255 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1256 ; GFX11-NEXT: v_min_i16 v1, v1, v2
1257 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
1258 ; GFX11-NEXT: s_endpgm
; Per-work-item i16 element addresses, followed by the signed-min idiom
; (icmp slt + select on the same operands). The loads and the store carry no
; explicit alignment.
1259 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1260 %a.gep = getelementptr inbounds i16, ptr addrspace(1) %aptr, i32 %tid
1261 %b.gep = getelementptr inbounds i16, ptr addrspace(1) %bptr, i32 %tid
1262 %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
1264 %a = load i16, ptr addrspace(1) %a.gep
1265 %b = load i16, ptr addrspace(1) %b.gep
1266 %cmp = icmp slt i16 %a, %b
1267 %val = select i1 %cmp, i16 %a, i16 %b
1268 store i16 %val, ptr addrspace(1) %out.gep
; s_test_imin_slt_i32: both i32 operands are kernel arguments. The result of
; icmp slt + select (%a when %a < %b signed, else %b) is stored to %out.
; The CI/VI/GFX9/GFX10/GFX11 checks expect a single scalar s_min_i32 on the
; loaded arguments; the EG checks expect MIN_INT reading KC0[2].Z and
; KC0[2].W directly.
1272 define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
1273 ; EG-LABEL: s_test_imin_slt_i32:
1275 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1276 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1279 ; EG-NEXT: ALU clause starting at 4:
1280 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
1281 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
1282 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1284 ; CI-LABEL: s_test_imin_slt_i32:
1286 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1287 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1288 ; CI-NEXT: s_min_i32 s2, s2, s3
1289 ; CI-NEXT: v_mov_b32_e32 v0, s0
1290 ; CI-NEXT: v_mov_b32_e32 v1, s1
1291 ; CI-NEXT: v_mov_b32_e32 v2, s2
1292 ; CI-NEXT: flat_store_dword v[0:1], v2
1295 ; VI-LABEL: s_test_imin_slt_i32:
1297 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1298 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1299 ; VI-NEXT: s_min_i32 s2, s2, s3
1300 ; VI-NEXT: v_mov_b32_e32 v0, s0
1301 ; VI-NEXT: v_mov_b32_e32 v1, s1
1302 ; VI-NEXT: v_mov_b32_e32 v2, s2
1303 ; VI-NEXT: flat_store_dword v[0:1], v2
1306 ; GFX9-LABEL: s_test_imin_slt_i32:
1308 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1309 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1310 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1311 ; GFX9-NEXT: s_min_i32 s2, s2, s3
1312 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1313 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1314 ; GFX9-NEXT: s_endpgm
1316 ; GFX10-LABEL: s_test_imin_slt_i32:
1318 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1319 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1320 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1321 ; GFX10-NEXT: s_min_i32 s2, s2, s3
1322 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
1323 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1324 ; GFX10-NEXT: s_endpgm
1326 ; GFX11-LABEL: s_test_imin_slt_i32:
1328 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1329 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1330 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1331 ; GFX11-NEXT: s_min_i32 s2, s2, s3
1332 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1333 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
1334 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1335 ; GFX11-NEXT: s_endpgm
; Signed-min idiom under test: icmp slt + select on the same two operands.
1336 %cmp = icmp slt i32 %a, %b
1337 %val = select i1 %cmp, i32 %a, i32 %b
1338 store i32 %val, ptr addrspace(1) %out, align 4
; s_test_imin_slt_v2i32: <2 x i32> variant with both operands passed as
; kernel arguments. The per-lane icmp slt + select result is stored to %out.
; The CI/VI/GFX9/GFX10/GFX11 checks expect one s_min_i32 per element (two in
; total); the EG checks expect two MIN_INT instructions.
1342 define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
1343 ; EG-LABEL: s_test_imin_slt_v2i32:
1345 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
1346 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1349 ; EG-NEXT: ALU clause starting at 4:
1350 ; EG-NEXT: MIN_INT * T0.Y, KC0[3].X, KC0[3].Z,
1351 ; EG-NEXT: MIN_INT * T0.X, KC0[2].W, KC0[3].Y,
1352 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1353 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1355 ; CI-LABEL: s_test_imin_slt_v2i32:
1357 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
1358 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
1359 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1360 ; CI-NEXT: s_min_i32 s1, s1, s3
1361 ; CI-NEXT: s_min_i32 s0, s0, s2
1362 ; CI-NEXT: v_mov_b32_e32 v2, s4
1363 ; CI-NEXT: v_mov_b32_e32 v0, s0
1364 ; CI-NEXT: v_mov_b32_e32 v1, s1
1365 ; CI-NEXT: v_mov_b32_e32 v3, s5
1366 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1369 ; VI-LABEL: s_test_imin_slt_v2i32:
1371 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
1372 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
1373 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1374 ; VI-NEXT: s_min_i32 s1, s1, s3
1375 ; VI-NEXT: s_min_i32 s0, s0, s2
1376 ; VI-NEXT: v_mov_b32_e32 v2, s4
1377 ; VI-NEXT: v_mov_b32_e32 v0, s0
1378 ; VI-NEXT: v_mov_b32_e32 v1, s1
1379 ; VI-NEXT: v_mov_b32_e32 v3, s5
1380 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1383 ; GFX9-LABEL: s_test_imin_slt_v2i32:
1385 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
1386 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
1387 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1388 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1389 ; GFX9-NEXT: s_min_i32 s1, s1, s3
1390 ; GFX9-NEXT: s_min_i32 s0, s0, s2
1391 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1392 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1393 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
1394 ; GFX9-NEXT: s_endpgm
1396 ; GFX10-LABEL: s_test_imin_slt_v2i32:
1398 ; GFX10-NEXT: s_clause 0x1
1399 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
1400 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
1401 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1402 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1403 ; GFX10-NEXT: s_min_i32 s0, s0, s2
1404 ; GFX10-NEXT: s_min_i32 s1, s1, s3
1405 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1406 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1407 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
1408 ; GFX10-NEXT: s_endpgm
1410 ; GFX11-LABEL: s_test_imin_slt_v2i32:
1412 ; GFX11-NEXT: s_clause 0x1
1413 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
1414 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
1415 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1416 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1417 ; GFX11-NEXT: s_min_i32 s0, s0, s2
1418 ; GFX11-NEXT: s_min_i32 s1, s1, s3
1419 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
1420 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
1421 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
1422 ; GFX11-NEXT: s_endpgm
; Vector signed-min idiom: lane-wise icmp slt feeding a lane-wise select. The
; store carries no explicit alignment.
1423 %cmp = icmp slt <2 x i32> %a, %b
1424 %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
1425 store <2 x i32> %val, ptr addrspace(1) %out
; s_test_imin_slt_imm_i32: signed min of a scalar kernel argument against the
; constant 8, written as icmp slt %a, 8 followed by a select between %a and 8.
; The CI/VI/GFX9/GFX10/GFX11 checks expect the constant to appear directly as
; the second operand of s_min_i32 ("s_min_i32 s2, s2, 8"); the EG checks
; expect MIN_INT with 8 supplied as literal.y.
1429 define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a) #0 {
1430 ; EG-LABEL: s_test_imin_slt_imm_i32:
1432 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1433 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1436 ; EG-NEXT: ALU clause starting at 4:
1437 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
1438 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, literal.y,
1439 ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
1441 ; CI-LABEL: s_test_imin_slt_imm_i32:
1443 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2
1444 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1445 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1446 ; CI-NEXT: s_min_i32 s2, s2, 8
1447 ; CI-NEXT: v_mov_b32_e32 v0, s0
1448 ; CI-NEXT: v_mov_b32_e32 v1, s1
1449 ; CI-NEXT: v_mov_b32_e32 v2, s2
1450 ; CI-NEXT: flat_store_dword v[0:1], v2
1453 ; VI-LABEL: s_test_imin_slt_imm_i32:
1455 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
1456 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1457 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1458 ; VI-NEXT: s_min_i32 s2, s2, 8
1459 ; VI-NEXT: v_mov_b32_e32 v0, s0
1460 ; VI-NEXT: v_mov_b32_e32 v1, s1
1461 ; VI-NEXT: v_mov_b32_e32 v2, s2
1462 ; VI-NEXT: flat_store_dword v[0:1], v2
1465 ; GFX9-LABEL: s_test_imin_slt_imm_i32:
1467 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
1468 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1469 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1470 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1471 ; GFX9-NEXT: s_min_i32 s2, s2, 8
1472 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1473 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1474 ; GFX9-NEXT: s_endpgm
1476 ; GFX10-LABEL: s_test_imin_slt_imm_i32:
1478 ; GFX10-NEXT: s_clause 0x1
1479 ; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
1480 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1481 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1482 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1483 ; GFX10-NEXT: s_min_i32 s2, s2, 8
1484 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
1485 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1486 ; GFX10-NEXT: s_endpgm
1488 ; GFX11-LABEL: s_test_imin_slt_imm_i32:
1490 ; GFX11-NEXT: s_clause 0x1
1491 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
1492 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
1493 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1494 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1495 ; GFX11-NEXT: s_min_i32 s2, s2, 8
1496 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1497 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
1498 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1499 ; GFX11-NEXT: s_endpgm
; The constant 8 is used both as the compare operand and as the select's
; false value, so the pair forms a signed min against an immediate.
1500 %cmp = icmp slt i32 %a, 8
1501 %val = select i1 %cmp, i32 %a, i32 8
1502 store i32 %val, ptr addrspace(1) %out, align 4
; s_test_imin_sle_imm_i32: signed min of a scalar kernel argument against the
; constant 8, written with the non-strict predicate (icmp sle %a, 8) followed
; by a select between %a and 8.
; The CI/VI/GFX9/GFX10/GFX11 checks expect "s_min_i32 s2, s2, 8" with the
; immediate left as 8; the EG checks expect MIN_INT with 8 as literal.y.
1506 define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a) #0 {
1507 ; EG-LABEL: s_test_imin_sle_imm_i32:
1509 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1510 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1513 ; EG-NEXT: ALU clause starting at 4:
1514 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
1515 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, literal.y,
1516 ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
1518 ; CI-LABEL: s_test_imin_sle_imm_i32:
1520 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2
1521 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1522 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1523 ; CI-NEXT: s_min_i32 s2, s2, 8
1524 ; CI-NEXT: v_mov_b32_e32 v0, s0
1525 ; CI-NEXT: v_mov_b32_e32 v1, s1
1526 ; CI-NEXT: v_mov_b32_e32 v2, s2
1527 ; CI-NEXT: flat_store_dword v[0:1], v2
1530 ; VI-LABEL: s_test_imin_sle_imm_i32:
1532 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
1533 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1534 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1535 ; VI-NEXT: s_min_i32 s2, s2, 8
1536 ; VI-NEXT: v_mov_b32_e32 v0, s0
1537 ; VI-NEXT: v_mov_b32_e32 v1, s1
1538 ; VI-NEXT: v_mov_b32_e32 v2, s2
1539 ; VI-NEXT: flat_store_dword v[0:1], v2
1542 ; GFX9-LABEL: s_test_imin_sle_imm_i32:
1544 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
1545 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1546 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1547 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1548 ; GFX9-NEXT: s_min_i32 s2, s2, 8
1549 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1550 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1551 ; GFX9-NEXT: s_endpgm
1553 ; GFX10-LABEL: s_test_imin_sle_imm_i32:
1555 ; GFX10-NEXT: s_clause 0x1
1556 ; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
1557 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1558 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1559 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1560 ; GFX10-NEXT: s_min_i32 s2, s2, 8
1561 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
1562 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1563 ; GFX10-NEXT: s_endpgm
1565 ; GFX11-LABEL: s_test_imin_sle_imm_i32:
1567 ; GFX11-NEXT: s_clause 0x1
1568 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
1569 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
1570 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1571 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1572 ; GFX11-NEXT: s_min_i32 s2, s2, 8
1573 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1574 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
1575 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1576 ; GFX11-NEXT: s_endpgm
; Non-strict compare against 8 with 8 as the select's false value: when %a
; equals 8 either arm yields 8, so this is still min(%a, 8).
1577 %cmp = icmp sle i32 %a, 8
1578 %val = select i1 %cmp, i32 %a, i32 8
1579 store i32 %val, ptr addrspace(1) %out, align 4
; v_test_umin_ule_i32: unsigned counterpart of the per-work-item min tests.
; Each work-item loads one i32 from %a.ptr and %b.ptr at index workitem.id.x
; and stores the icmp ule + select result (%a when %a <= %b unsigned, else
; %b) to %out at the same index.
; The CI/VI/GFX9/GFX10/GFX11 checks expect v_min_u32_e32; the EG checks
; expect MIN_UINT.
1583 define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1584 ; EG-LABEL: v_test_umin_ule_i32:
1586 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
1588 ; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
1589 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1592 ; EG-NEXT: Fetch clause starting at 6:
1593 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
1594 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1595 ; EG-NEXT: ALU clause starting at 10:
1596 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1597 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1598 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
1599 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
1600 ; EG-NEXT: ALU clause starting at 14:
1601 ; EG-NEXT: MIN_UINT T0.X, T0.X, T1.X,
1602 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1603 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
1604 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1606 ; CI-LABEL: v_test_umin_ule_i32:
1608 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1609 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
1610 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1611 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1612 ; CI-NEXT: v_mov_b32_e32 v1, s3
1613 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
1614 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1615 ; CI-NEXT: v_mov_b32_e32 v3, s5
1616 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
1617 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1618 ; CI-NEXT: flat_load_dword v5, v[0:1]
1619 ; CI-NEXT: flat_load_dword v2, v[2:3]
1620 ; CI-NEXT: v_mov_b32_e32 v1, s1
1621 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
1622 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1623 ; CI-NEXT: s_waitcnt vmcnt(0)
1624 ; CI-NEXT: v_min_u32_e32 v2, v5, v2
1625 ; CI-NEXT: flat_store_dword v[0:1], v2
1628 ; VI-LABEL: v_test_umin_ule_i32:
1630 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1631 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1632 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1633 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1634 ; VI-NEXT: v_mov_b32_e32 v1, s3
1635 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1636 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1637 ; VI-NEXT: v_mov_b32_e32 v3, s5
1638 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1639 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1640 ; VI-NEXT: flat_load_dword v5, v[0:1]
1641 ; VI-NEXT: flat_load_dword v2, v[2:3]
1642 ; VI-NEXT: v_mov_b32_e32 v1, s1
1643 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
1644 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1645 ; VI-NEXT: s_waitcnt vmcnt(0)
1646 ; VI-NEXT: v_min_u32_e32 v2, v5, v2
1647 ; VI-NEXT: flat_store_dword v[0:1], v2
1650 ; GFX9-LABEL: v_test_umin_ule_i32:
1652 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1653 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1654 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1655 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1656 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1657 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
1658 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1659 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2
1660 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1661 ; GFX9-NEXT: s_endpgm
1663 ; GFX10-LABEL: v_test_umin_ule_i32:
1665 ; GFX10-NEXT: s_clause 0x1
1666 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1667 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1668 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1669 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1670 ; GFX10-NEXT: s_clause 0x1
1671 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1672 ; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
1673 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1674 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2
1675 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1676 ; GFX10-NEXT: s_endpgm
1678 ; GFX11-LABEL: v_test_umin_ule_i32:
1680 ; GFX11-NEXT: s_clause 0x1
1681 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1682 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
1683 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1684 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1685 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1686 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1687 ; GFX11-NEXT: s_clause 0x1
1688 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1689 ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
1690 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1691 ; GFX11-NEXT: v_min_u32_e32 v1, v1, v2
1692 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1693 ; GFX11-NEXT: s_endpgm
; Index all three buffers by the work-item id, then apply the unsigned-min
; idiom (icmp ule + select on the same operands) to the two loaded values.
1694 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1695 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
1696 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
1697 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
1698 %a = load i32, ptr addrspace(1) %a.gep, align 4
1699 %b = load i32, ptr addrspace(1) %b.gep, align 4
1700 %cmp = icmp ule i32 %a, %b
1701 %val = select i1 %cmp, i32 %a, i32 %b
1702 store i32 %val, ptr addrspace(1) %out.gep, align 4
; Unsigned minimum of two <3 x i32> vectors, written as icmp ule + select.
; Each workitem (llvm.amdgcn.workitem.id.x) loads its own element of %a.ptr and
; %b.ptr and stores the result to the same index of %out. The checks expect one
; MIN_UINT / v_min_u32 per vector component, and 96-bit (dwordx3 / b96) loads
; and stores on the amdgcn targets.
1706 define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1707 ; EG-LABEL: v_test_umin_ule_v3i32:
1709 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
1711 ; EG-NEXT: ALU 9, @14, KC0[CB0:0-32], KC1[]
1712 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
1713 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1715 ; EG-NEXT: Fetch clause starting at 6:
1716 ; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1
1717 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1
1718 ; EG-NEXT: ALU clause starting at 10:
1719 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1720 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1721 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
1722 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
1723 ; EG-NEXT: ALU clause starting at 14:
1724 ; EG-NEXT: MIN_UINT * T0.Y, T2.Y, T1.Y,
1725 ; EG-NEXT: MIN_UINT T0.X, T2.X, T1.X,
1726 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1727 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
1728 ; EG-NEXT: MIN_UINT * T2.X, T2.Z, T1.Z,
1729 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1730 ; EG-NEXT: ADD_INT * T0.W, T0.W, literal.x,
1731 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1732 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
1733 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1735 ; CI-LABEL: v_test_umin_ule_v3i32:
1737 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1738 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
1739 ; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1740 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1741 ; CI-NEXT: v_mov_b32_e32 v1, s3
1742 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v6
1743 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1744 ; CI-NEXT: v_mov_b32_e32 v2, s5
1745 ; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v6
1746 ; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
1747 ; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
1748 ; CI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
1749 ; CI-NEXT: v_mov_b32_e32 v7, s1
1750 ; CI-NEXT: v_add_i32_e32 v6, vcc, s0, v6
1751 ; CI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
1752 ; CI-NEXT: s_waitcnt vmcnt(0)
1753 ; CI-NEXT: v_min_u32_e32 v2, v2, v5
1754 ; CI-NEXT: v_min_u32_e32 v1, v1, v4
1755 ; CI-NEXT: v_min_u32_e32 v0, v0, v3
1756 ; CI-NEXT: flat_store_dwordx3 v[6:7], v[0:2]
1759 ; VI-LABEL: v_test_umin_ule_v3i32:
1761 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1762 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1763 ; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1764 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1765 ; VI-NEXT: v_mov_b32_e32 v1, s3
1766 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
1767 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1768 ; VI-NEXT: v_mov_b32_e32 v2, s5
1769 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v6
1770 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
1771 ; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
1772 ; VI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
1773 ; VI-NEXT: v_mov_b32_e32 v7, s1
1774 ; VI-NEXT: v_add_u32_e32 v6, vcc, s0, v6
1775 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
1776 ; VI-NEXT: s_waitcnt vmcnt(0)
1777 ; VI-NEXT: v_min_u32_e32 v2, v2, v5
1778 ; VI-NEXT: v_min_u32_e32 v1, v1, v4
1779 ; VI-NEXT: v_min_u32_e32 v0, v0, v3
1780 ; VI-NEXT: flat_store_dwordx3 v[6:7], v[0:2]
1783 ; GFX9-LABEL: v_test_umin_ule_v3i32:
1785 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1786 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1787 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1788 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1789 ; GFX9-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3]
1790 ; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5]
1791 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1792 ; GFX9-NEXT: v_min_u32_e32 v2, v2, v5
1793 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v4
1794 ; GFX9-NEXT: v_min_u32_e32 v0, v0, v3
1795 ; GFX9-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
1796 ; GFX9-NEXT: s_endpgm
1798 ; GFX10-LABEL: v_test_umin_ule_v3i32:
1800 ; GFX10-NEXT: s_clause 0x1
1801 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1802 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1803 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1804 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1805 ; GFX10-NEXT: s_clause 0x1
1806 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3]
1807 ; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5]
1808 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1809 ; GFX10-NEXT: v_min_u32_e32 v2, v2, v5
1810 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v4
1811 ; GFX10-NEXT: v_min_u32_e32 v0, v0, v3
1812 ; GFX10-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
1813 ; GFX10-NEXT: s_endpgm
1815 ; GFX11-LABEL: v_test_umin_ule_v3i32:
1817 ; GFX11-NEXT: s_clause 0x1
1818 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1819 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
1820 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1821 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1822 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1823 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1824 ; GFX11-NEXT: s_clause 0x1
1825 ; GFX11-NEXT: global_load_b96 v[0:2], v6, s[2:3]
1826 ; GFX11-NEXT: global_load_b96 v[3:5], v6, s[4:5]
1827 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1828 ; GFX11-NEXT: v_min_u32_e32 v2, v2, v5
1829 ; GFX11-NEXT: v_min_u32_e32 v1, v1, v4
1830 ; GFX11-NEXT: v_min_u32_e32 v0, v0, v3
1831 ; GFX11-NEXT: global_store_b96 v6, v[0:2], s[0:1]
1832 ; GFX11-NEXT: s_endpgm
1833 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1834 %a.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %a.ptr, i32 %tid
1835 %b.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %b.ptr, i32 %tid
1836 %out.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %out, i32 %tid
1838 %a = load <3 x i32>, ptr addrspace(1) %a.gep
1839 %b = load <3 x i32>, ptr addrspace(1) %b.gep
1840 %cmp = icmp ule <3 x i32> %a, %b
1841 %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
1842 store <3 x i32> %val, ptr addrspace(1) %out.gep
1846 ; FIXME: Reduce the packed min whose high half is unused (the third element) to a scalar 16-bit operation
; Unsigned minimum of two <3 x i16> vectors, written as icmp ule + select, with
; every access indexed by llvm.amdgcn.workitem.id.x. The kaveri checks expect
; the 16-bit halves to be unpacked (shift / mask) and compared with v_min_u32,
; the tonga checks expect v_min_u16 in plain and SDWA form, and the gfx900 and
; newer checks expect two v_pk_min_u16. On the amdgcn targets the result is
; stored as one dword plus one short at byte offset 4.
1848 define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1849 ; EG-LABEL: v_test_umin_ule_v3i16:
1851 ; EG-NEXT: ALU 3, @20, KC0[CB0:0-32], KC1[]
1853 ; EG-NEXT: ALU 11, @24, KC0[CB0:0-32], KC1[]
1854 ; EG-NEXT: TEX 3 @12
1855 ; EG-NEXT: ALU 8, @36, KC0[], KC1[]
1856 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T8.X, 0
1857 ; EG-NEXT: MEM_RAT MSKOR T7.XW, T0.X
1859 ; EG-NEXT: Fetch clause starting at 8:
1860 ; EG-NEXT: VTX_READ_16 T7.X, T6.X, 4, #1
1861 ; EG-NEXT: VTX_READ_16 T8.X, T0.X, 4, #1
1862 ; EG-NEXT: Fetch clause starting at 12:
1863 ; EG-NEXT: VTX_READ_16 T8.X, T6.X, 0, #1
1864 ; EG-NEXT: VTX_READ_16 T9.X, T0.X, 0, #1
1865 ; EG-NEXT: VTX_READ_16 T6.X, T6.X, 2, #1
1866 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1
1867 ; EG-NEXT: ALU clause starting at 20:
1868 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1869 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1870 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
1871 ; EG-NEXT: ADD_INT * T6.X, KC0[2].W, PV.W,
1872 ; EG-NEXT: ALU clause starting at 24:
1873 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1874 ; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
1875 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1876 ; EG-NEXT: AND_INT * T2.W, PV.W, literal.x,
1877 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1878 ; EG-NEXT: LSHL T2.W, PV.W, literal.x,
1879 ; EG-NEXT: MIN_UINT * T3.W, T8.X, T7.X,
1880 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1881 ; EG-NEXT: LSHL T7.X, PS, PV.W,
1882 ; EG-NEXT: LSHL * T7.W, literal.x, PV.W,
1883 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1884 ; EG-NEXT: MOV * T7.Y, 0.0,
1885 ; EG-NEXT: ALU clause starting at 36:
1886 ; EG-NEXT: MOV T7.Z, 0.0,
1887 ; EG-NEXT: MIN_UINT * T2.W, T0.X, T6.X,
1888 ; EG-NEXT: LSHR T0.X, T1.W, literal.x,
1889 ; EG-NEXT: LSHL T1.W, PV.W, literal.y,
1890 ; EG-NEXT: MIN_UINT * T2.W, T9.X, T8.X,
1891 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
1892 ; EG-NEXT: OR_INT T6.X, PV.W, PS,
1893 ; EG-NEXT: LSHR * T8.X, T0.W, literal.x,
1894 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1896 ; CI-LABEL: v_test_umin_ule_v3i16:
1898 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1899 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
1900 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1901 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1902 ; CI-NEXT: v_mov_b32_e32 v1, s3
1903 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
1904 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1905 ; CI-NEXT: v_mov_b32_e32 v3, s5
1906 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
1907 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1908 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1909 ; CI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1910 ; CI-NEXT: v_mov_b32_e32 v5, s1
1911 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
1912 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
1913 ; CI-NEXT: v_add_i32_e32 v6, vcc, 4, v4
1914 ; CI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
1915 ; CI-NEXT: s_waitcnt vmcnt(1)
1916 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0
1917 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1918 ; CI-NEXT: s_waitcnt vmcnt(0)
1919 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
1920 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1921 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
1922 ; CI-NEXT: v_and_b32_e32 v3, 0xffff, v3
1923 ; CI-NEXT: v_min_u32_e32 v0, v0, v2
1924 ; CI-NEXT: v_min_u32_e32 v2, v8, v9
1925 ; CI-NEXT: v_min_u32_e32 v1, v1, v3
1926 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1927 ; CI-NEXT: v_or_b32_e32 v0, v0, v2
1928 ; CI-NEXT: flat_store_short v[6:7], v1
1929 ; CI-NEXT: flat_store_dword v[4:5], v0
1932 ; VI-LABEL: v_test_umin_ule_v3i16:
1934 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1935 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1936 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1937 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1938 ; VI-NEXT: v_mov_b32_e32 v1, s3
1939 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1940 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1941 ; VI-NEXT: v_mov_b32_e32 v3, s5
1942 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1943 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1944 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1945 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1946 ; VI-NEXT: v_mov_b32_e32 v5, s1
1947 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
1948 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
1949 ; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v4
1950 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
1951 ; VI-NEXT: s_waitcnt vmcnt(0)
1952 ; VI-NEXT: v_min_u16_e32 v8, v0, v2
1953 ; VI-NEXT: v_min_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1954 ; VI-NEXT: v_min_u16_e32 v1, v1, v3
1955 ; VI-NEXT: v_or_b32_e32 v0, v8, v0
1956 ; VI-NEXT: flat_store_short v[6:7], v1
1957 ; VI-NEXT: flat_store_dword v[4:5], v0
1960 ; GFX9-LABEL: v_test_umin_ule_v3i16:
1962 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1963 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1964 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1965 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1966 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
1967 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
1968 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1969 ; GFX9-NEXT: v_pk_min_u16 v1, v1, v3
1970 ; GFX9-NEXT: v_pk_min_u16 v0, v0, v2
1971 ; GFX9-NEXT: global_store_short v4, v1, s[0:1] offset:4
1972 ; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
1973 ; GFX9-NEXT: s_endpgm
1975 ; GFX10-LABEL: v_test_umin_ule_v3i16:
1977 ; GFX10-NEXT: s_clause 0x1
1978 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1979 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1980 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1981 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1982 ; GFX10-NEXT: s_clause 0x1
1983 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
1984 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
1985 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1986 ; GFX10-NEXT: v_pk_min_u16 v1, v1, v3
1987 ; GFX10-NEXT: v_pk_min_u16 v0, v0, v2
1988 ; GFX10-NEXT: global_store_short v4, v1, s[0:1] offset:4
1989 ; GFX10-NEXT: global_store_dword v4, v0, s[0:1]
1990 ; GFX10-NEXT: s_endpgm
1992 ; GFX11-LABEL: v_test_umin_ule_v3i16:
1994 ; GFX11-NEXT: s_clause 0x1
1995 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1996 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
1997 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1998 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1999 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
2000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2001 ; GFX11-NEXT: s_clause 0x1
2002 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3]
2003 ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5]
2004 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2005 ; GFX11-NEXT: v_pk_min_u16 v1, v1, v3
2006 ; GFX11-NEXT: v_pk_min_u16 v0, v0, v2
2007 ; GFX11-NEXT: s_clause 0x1
2008 ; GFX11-NEXT: global_store_b16 v4, v1, s[0:1] offset:4
2009 ; GFX11-NEXT: global_store_b32 v4, v0, s[0:1]
2010 ; GFX11-NEXT: s_endpgm
2011 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2012 %a.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
2013 %b.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
2014 %out.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %out, i32 %tid
2016 %a = load <3 x i16>, ptr addrspace(1) %a.gep
2017 %b = load <3 x i16>, ptr addrspace(1) %b.gep
2018 %cmp = icmp ule <3 x i16> %a, %b
2019 %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
2020 store <3 x i16> %val, ptr addrspace(1) %out.gep
; Unsigned minimum of two i32 kernel arguments, written as icmp ule + select,
; with the result stored directly to %out. The amdgcn checks expect a single
; scalar s_min_u32 on the loaded arguments; the r600 checks expect MIN_UINT
; operating on the constant-buffer operands.
2024 define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
2025 ; EG-LABEL: s_test_umin_ule_i32:
2027 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
2028 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2031 ; EG-NEXT: ALU clause starting at 4:
2032 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
2033 ; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
2034 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2036 ; CI-LABEL: s_test_umin_ule_i32:
2038 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2039 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2040 ; CI-NEXT: s_min_u32 s2, s2, s3
2041 ; CI-NEXT: v_mov_b32_e32 v0, s0
2042 ; CI-NEXT: v_mov_b32_e32 v1, s1
2043 ; CI-NEXT: v_mov_b32_e32 v2, s2
2044 ; CI-NEXT: flat_store_dword v[0:1], v2
2047 ; VI-LABEL: s_test_umin_ule_i32:
2049 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2050 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2051 ; VI-NEXT: s_min_u32 s2, s2, s3
2052 ; VI-NEXT: v_mov_b32_e32 v0, s0
2053 ; VI-NEXT: v_mov_b32_e32 v1, s1
2054 ; VI-NEXT: v_mov_b32_e32 v2, s2
2055 ; VI-NEXT: flat_store_dword v[0:1], v2
2058 ; GFX9-LABEL: s_test_umin_ule_i32:
2060 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2061 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2062 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2063 ; GFX9-NEXT: s_min_u32 s2, s2, s3
2064 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2065 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2066 ; GFX9-NEXT: s_endpgm
2068 ; GFX10-LABEL: s_test_umin_ule_i32:
2070 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2071 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2072 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2073 ; GFX10-NEXT: s_min_u32 s2, s2, s3
2074 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
2075 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2076 ; GFX10-NEXT: s_endpgm
2078 ; GFX11-LABEL: s_test_umin_ule_i32:
2080 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
2081 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2082 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2083 ; GFX11-NEXT: s_min_u32 s2, s2, s3
2084 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2085 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
2086 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2087 ; GFX11-NEXT: s_endpgm
2088 %cmp = icmp ule i32 %a, %b
2089 %val = select i1 %cmp, i32 %a, i32 %b
2090 store i32 %val, ptr addrspace(1) %out, align 4
; Unsigned minimum of two i32 values, written as icmp ult + select (strict
; compare). Each workitem (llvm.amdgcn.workitem.id.x) loads its own element of
; %a.ptr and %b.ptr and stores the result to the same index of %out. The checks
; expect a single MIN_UINT / v_min_u32.
2094 define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
2095 ; EG-LABEL: v_test_umin_ult_i32:
2097 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
2099 ; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
2100 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2103 ; EG-NEXT: Fetch clause starting at 6:
2104 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
2105 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
2106 ; EG-NEXT: ALU clause starting at 10:
2107 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
2108 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2109 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
2110 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
2111 ; EG-NEXT: ALU clause starting at 14:
2112 ; EG-NEXT: MIN_UINT T0.X, T0.X, T1.X,
2113 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
2114 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
2115 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2117 ; CI-LABEL: v_test_umin_ult_i32:
2119 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2120 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
2121 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
2122 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2123 ; CI-NEXT: v_mov_b32_e32 v1, s3
2124 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
2125 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2126 ; CI-NEXT: v_mov_b32_e32 v3, s5
2127 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
2128 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2129 ; CI-NEXT: flat_load_dword v5, v[0:1]
2130 ; CI-NEXT: flat_load_dword v2, v[2:3]
2131 ; CI-NEXT: v_mov_b32_e32 v1, s1
2132 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
2133 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2134 ; CI-NEXT: s_waitcnt vmcnt(0)
2135 ; CI-NEXT: v_min_u32_e32 v2, v5, v2
2136 ; CI-NEXT: flat_store_dword v[0:1], v2
2139 ; VI-LABEL: v_test_umin_ult_i32:
2141 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2142 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
2143 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
2144 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2145 ; VI-NEXT: v_mov_b32_e32 v1, s3
2146 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
2147 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2148 ; VI-NEXT: v_mov_b32_e32 v3, s5
2149 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
2150 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2151 ; VI-NEXT: flat_load_dword v5, v[0:1]
2152 ; VI-NEXT: flat_load_dword v2, v[2:3]
2153 ; VI-NEXT: v_mov_b32_e32 v1, s1
2154 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
2155 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2156 ; VI-NEXT: s_waitcnt vmcnt(0)
2157 ; VI-NEXT: v_min_u32_e32 v2, v5, v2
2158 ; VI-NEXT: flat_store_dword v[0:1], v2
2161 ; GFX9-LABEL: v_test_umin_ult_i32:
2163 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2164 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
2165 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2166 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2167 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2168 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
2169 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2170 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2
2171 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2172 ; GFX9-NEXT: s_endpgm
2174 ; GFX10-LABEL: v_test_umin_ult_i32:
2176 ; GFX10-NEXT: s_clause 0x1
2177 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2178 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
2179 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2180 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2181 ; GFX10-NEXT: s_clause 0x1
2182 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
2183 ; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
2184 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2185 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2
2186 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2187 ; GFX10-NEXT: s_endpgm
2189 ; GFX11-LABEL: v_test_umin_ult_i32:
2191 ; GFX11-NEXT: s_clause 0x1
2192 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
2193 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
2194 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2195 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2196 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2197 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2198 ; GFX11-NEXT: s_clause 0x1
2199 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2200 ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
2201 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2202 ; GFX11-NEXT: v_min_u32_e32 v1, v1, v2
2203 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2204 ; GFX11-NEXT: s_endpgm
2205 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2206 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
2207 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
2208 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
2209 %a = load i32, ptr addrspace(1) %a.gep, align 4
2210 %b = load i32, ptr addrspace(1) %b.gep, align 4
2211 %cmp = icmp ult i32 %a, %b
2212 %val = select i1 %cmp, i32 %a, i32 %b
2213 store i32 %val, ptr addrspace(1) %out.gep, align 4
; Unsigned minimum of two i8 values, written as icmp ult + select, with every
; access indexed by llvm.amdgcn.workitem.id.x. On the amdgcn targets the checks
; expect unsigned byte loads and a byte store; the kaveri checks expect the
; compare to be done with v_min_u32, while the tonga and newer checks expect a
; 16-bit v_min_u16.
2217 define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
2218 ; EG-LABEL: v_test_umin_ult_i8:
2220 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
2222 ; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[]
2223 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X
2226 ; EG-NEXT: Fetch clause starting at 6:
2227 ; EG-NEXT: VTX_READ_8 T2.X, T2.X, 0, #1
2228 ; EG-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1
2229 ; EG-NEXT: ALU clause starting at 10:
2230 ; EG-NEXT: ADD_INT T1.X, KC0[2].Z, T0.X,
2231 ; EG-NEXT: ADD_INT * T2.X, KC0[2].W, T0.X,
2232 ; EG-NEXT: ALU clause starting at 12:
2233 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.X,
2234 ; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
2235 ; EG-NEXT: MIN_UINT * T2.W, T1.X, T2.X,
2236 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2237 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
2238 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2239 ; EG-NEXT: LSHL T1.X, T2.W, PV.W,
2240 ; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
2241 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2242 ; EG-NEXT: MOV T1.Y, 0.0,
2243 ; EG-NEXT: MOV * T1.Z, 0.0,
2244 ; EG-NEXT: LSHR * T0.X, T0.W, literal.x,
2245 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2247 ; CI-LABEL: v_test_umin_ult_i8:
2249 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2250 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
2251 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2252 ; CI-NEXT: v_mov_b32_e32 v2, s3
2253 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
2254 ; CI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
2255 ; CI-NEXT: v_mov_b32_e32 v4, s5
2256 ; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v0
2257 ; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
2258 ; CI-NEXT: flat_load_ubyte v2, v[1:2]
2259 ; CI-NEXT: flat_load_ubyte v3, v[3:4]
2260 ; CI-NEXT: v_mov_b32_e32 v1, s1
2261 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
2262 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2263 ; CI-NEXT: s_waitcnt vmcnt(0)
2264 ; CI-NEXT: v_min_u32_e32 v2, v2, v3
2265 ; CI-NEXT: flat_store_byte v[0:1], v2
2268 ; VI-LABEL: v_test_umin_ult_i8:
2270 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2271 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
2272 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2273 ; VI-NEXT: v_mov_b32_e32 v2, s3
2274 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0
2275 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
2276 ; VI-NEXT: v_mov_b32_e32 v4, s5
2277 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v0
2278 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
2279 ; VI-NEXT: flat_load_ubyte v2, v[1:2]
2280 ; VI-NEXT: flat_load_ubyte v3, v[3:4]
2281 ; VI-NEXT: v_mov_b32_e32 v1, s1
2282 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2283 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2284 ; VI-NEXT: s_waitcnt vmcnt(0)
2285 ; VI-NEXT: v_min_u16_e32 v2, v2, v3
2286 ; VI-NEXT: flat_store_byte v[0:1], v2
2289 ; GFX9-LABEL: v_test_umin_ult_i8:
2291 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2292 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
2293 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2294 ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
2295 ; GFX9-NEXT: global_load_ubyte v2, v0, s[4:5]
2296 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2297 ; GFX9-NEXT: v_min_u16_e32 v1, v1, v2
2298 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
2299 ; GFX9-NEXT: s_endpgm
2301 ; GFX10-LABEL: v_test_umin_ult_i8:
2303 ; GFX10-NEXT: s_clause 0x1
2304 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2305 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
2306 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2307 ; GFX10-NEXT: s_clause 0x1
2308 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
2309 ; GFX10-NEXT: global_load_ubyte v2, v0, s[4:5]
2310 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2311 ; GFX10-NEXT: v_min_u16 v1, v1, v2
2312 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
2313 ; GFX10-NEXT: s_endpgm
2315 ; GFX11-LABEL: v_test_umin_ult_i8:
2317 ; GFX11-NEXT: s_clause 0x1
2318 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
2319 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
2320 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2321 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2322 ; GFX11-NEXT: s_clause 0x1
2323 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3]
2324 ; GFX11-NEXT: global_load_u8 v2, v0, s[4:5]
2325 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2326 ; GFX11-NEXT: v_min_u16 v1, v1, v2
2327 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
2328 ; GFX11-NEXT: s_endpgm
2329 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2330 %a.gep = getelementptr inbounds i8, ptr addrspace(1) %a.ptr, i32 %tid
2331 %b.gep = getelementptr inbounds i8, ptr addrspace(1) %b.ptr, i32 %tid
2332 %out.gep = getelementptr inbounds i8, ptr addrspace(1) %out, i32 %tid
2334 %a = load i8, ptr addrspace(1) %a.gep, align 1
2335 %b = load i8, ptr addrspace(1) %b.gep, align 1
2336 %cmp = icmp ult i8 %a, %b
2337 %val = select i1 %cmp, i8 %a, i8 %b
2338 store i8 %val, ptr addrspace(1) %out.gep, align 1
; Unsigned minimum of two i32 kernel arguments, written as icmp ult + select
; (strict compare), with the result stored directly to %out. The amdgcn checks
; expect a single scalar s_min_u32; the r600 checks expect MIN_UINT operating
; on the constant-buffer operands.
2342 define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
2343 ; EG-LABEL: s_test_umin_ult_i32:
2345 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
2346 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2349 ; EG-NEXT: ALU clause starting at 4:
2350 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
2351 ; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
2352 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2354 ; CI-LABEL: s_test_umin_ult_i32:
2356 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2357 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2358 ; CI-NEXT: s_min_u32 s2, s2, s3
2359 ; CI-NEXT: v_mov_b32_e32 v0, s0
2360 ; CI-NEXT: v_mov_b32_e32 v1, s1
2361 ; CI-NEXT: v_mov_b32_e32 v2, s2
2362 ; CI-NEXT: flat_store_dword v[0:1], v2
2365 ; VI-LABEL: s_test_umin_ult_i32:
2367 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2368 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2369 ; VI-NEXT: s_min_u32 s2, s2, s3
2370 ; VI-NEXT: v_mov_b32_e32 v0, s0
2371 ; VI-NEXT: v_mov_b32_e32 v1, s1
2372 ; VI-NEXT: v_mov_b32_e32 v2, s2
2373 ; VI-NEXT: flat_store_dword v[0:1], v2
2376 ; GFX9-LABEL: s_test_umin_ult_i32:
2378 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2379 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2380 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2381 ; GFX9-NEXT: s_min_u32 s2, s2, s3
2382 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2383 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2384 ; GFX9-NEXT: s_endpgm
2386 ; GFX10-LABEL: s_test_umin_ult_i32:
2388 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2389 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2390 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2391 ; GFX10-NEXT: s_min_u32 s2, s2, s3
2392 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
2393 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2394 ; GFX10-NEXT: s_endpgm
2396 ; GFX11-LABEL: s_test_umin_ult_i32:
2398 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
2399 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2400 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2401 ; GFX11-NEXT: s_min_u32 s2, s2, s3
2402 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2403 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
2404 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2405 ; GFX11-NEXT: s_endpgm
2406 %cmp = icmp ult i32 %a, %b
2407 %val = select i1 %cmp, i32 %a, i32 %b
2408 store i32 %val, ptr addrspace(1) %out, align 4
; icmp ult + select on two i32 values where the compare result has a second
; use, being stored as an i1 to %out1 in addition to feeding the select whose
; result goes to %out0. The loads read %aptr and %bptr directly (no workitem-id
; indexing). The checks expect no min instruction here, but an explicit compare
; followed by a select (s_cmp_lt_u32 + s_cselect_b32 on amdgcn, SETGE_UINT +
; CNDE_INT on r600).
2412 define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
2413 ; EG-LABEL: v_test_umin_ult_i32_multi_use:
2415 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
2417 ; EG-NEXT: ALU 16, @12, KC0[CB0:0-32], KC1[]
2418 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 0
2419 ; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
2421 ; EG-NEXT: Fetch clause starting at 6:
2422 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
2423 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
2424 ; EG-NEXT: ALU clause starting at 10:
2425 ; EG-NEXT: MOV T0.X, KC0[2].W,
2426 ; EG-NEXT: MOV * T1.X, KC0[3].X,
2427 ; EG-NEXT: ALU clause starting at 12:
2428 ; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x,
2429 ; EG-NEXT: SETGT_UINT * T1.W, T1.X, T0.X,
2430 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2431 ; EG-NEXT: AND_INT T1.W, PS, 1,
2432 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2433 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2434 ; EG-NEXT: LSHL T2.X, PV.W, PS,
2435 ; EG-NEXT: LSHL * T2.W, literal.x, PS,
2436 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2437 ; EG-NEXT: MOV T2.Y, 0.0,
2438 ; EG-NEXT: MOV * T2.Z, 0.0,
2439 ; EG-NEXT: LSHR T3.X, KC0[2].Z, literal.x,
2440 ; EG-NEXT: SETGE_UINT * T0.W, T0.X, T1.X,
2441 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2442 ; EG-NEXT: CNDE_INT T0.X, PV.W, T0.X, T1.X,
2443 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2444 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2446 ; CI-LABEL: v_test_umin_ult_i32_multi_use:
2448 ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
2449 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2450 ; CI-NEXT: s_load_dword s4, s[4:5], 0x0
2451 ; CI-NEXT: s_load_dword s5, s[6:7], 0x0
2452 ; CI-NEXT: v_mov_b32_e32 v0, s0
2453 ; CI-NEXT: v_mov_b32_e32 v1, s1
2454 ; CI-NEXT: v_mov_b32_e32 v2, s2
2455 ; CI-NEXT: v_mov_b32_e32 v3, s3
2456 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2457 ; CI-NEXT: s_cmp_lt_u32 s4, s5
2458 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
2459 ; CI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
2460 ; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
2461 ; CI-NEXT: s_cselect_b32 s0, s4, s5
2462 ; CI-NEXT: v_mov_b32_e32 v5, s0
2463 ; CI-NEXT: flat_store_dword v[0:1], v5
2464 ; CI-NEXT: flat_store_byte v[2:3], v4
2467 ; VI-LABEL: v_test_umin_ult_i32_multi_use:
2469 ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
2470 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2471 ; VI-NEXT: s_load_dword s4, s[4:5], 0x0
2472 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0
2473 ; VI-NEXT: v_mov_b32_e32 v0, s0
2474 ; VI-NEXT: v_mov_b32_e32 v1, s1
2475 ; VI-NEXT: v_mov_b32_e32 v2, s2
2476 ; VI-NEXT: v_mov_b32_e32 v3, s3
2477 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2478 ; VI-NEXT: s_cmp_lt_u32 s4, s5
2479 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
2480 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
2481 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
2482 ; VI-NEXT: s_cselect_b32 s0, s4, s5
2483 ; VI-NEXT: v_mov_b32_e32 v5, s0
2484 ; VI-NEXT: flat_store_dword v[0:1], v5
2485 ; VI-NEXT: flat_store_byte v[2:3], v4
2488 ; GFX9-LABEL: v_test_umin_ult_i32_multi_use:
2490 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
2491 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2492 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2493 ; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0
2494 ; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0
2495 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2496 ; GFX9-NEXT: s_cmp_lt_u32 s8, s9
2497 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
2498 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
2499 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
2500 ; GFX9-NEXT: s_cselect_b32 s4, s8, s9
2501 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
2502 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
2503 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
2504 ; GFX9-NEXT: s_endpgm
2506 ; GFX10-LABEL: v_test_umin_ult_i32_multi_use:
2508 ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
2509 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
2510 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2511 ; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0
2512 ; GFX10-NEXT: s_load_dword s9, s[6:7], 0x0
2513 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2514 ; GFX10-NEXT: s_cmp_lt_u32 s8, s9
2515 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0
2516 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
2517 ; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
2518 ; GFX10-NEXT: s_cselect_b32 s4, s8, s9
2519 ; GFX10-NEXT: v_mov_b32_e32 v2, s4
2520 ; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
2521 ; GFX10-NEXT: global_store_byte v1, v0, s[2:3]
2522 ; GFX10-NEXT: s_endpgm
2524 ; GFX11-LABEL: v_test_umin_ult_i32_multi_use:
2526 ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x0
2527 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
2528 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2529 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
2530 ; GFX11-NEXT: s_load_b32 s5, s[6:7], 0x0
2531 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2532 ; GFX11-NEXT: s_cmp_lt_u32 s4, s5
2533 ; GFX11-NEXT: s_cselect_b32 s6, -1, 0
2534 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
2535 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
2536 ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
2537 ; GFX11-NEXT: s_cselect_b32 s4, s4, s5
2538 ; GFX11-NEXT: v_mov_b32_e32 v2, s4
2539 ; GFX11-NEXT: s_clause 0x1
2540 ; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
2541 ; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
2542 ; GFX11-NEXT: s_endpgm
2543 %a = load i32, ptr addrspace(1) %aptr, align 4
2544 %b = load i32, ptr addrspace(1) %bptr, align 4
2545 %cmp = icmp ult i32 %a, %b
2546 %val = select i1 %cmp, i32 %a, i32 %b
2547 store i32 %val, ptr addrspace(1) %out0, align 4
2548 store i1 %cmp, ptr addrspace(1) %out1
; Unsigned-min pattern (icmp ult + select) on two i16 values loaded from
; %aptr and %bptr, where the compare result has a second use: %cmp feeds the
; select and is also stored to %out1 as an i1. The expected output for every
; target below keeps an explicit compare followed by a conditional select
; rather than a single min instruction.
2552 define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
2553 ; EG-LABEL: v_test_umin_ult_i16_multi_use:
2555 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
2557 ; EG-NEXT: ALU 24, @12, KC0[CB0:0-32], KC1[]
2558 ; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
2559 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
2561 ; EG-NEXT: Fetch clause starting at 6:
2562 ; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
2563 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
2564 ; EG-NEXT: ALU clause starting at 10:
2565 ; EG-NEXT: MOV T0.X, KC0[2].W,
2566 ; EG-NEXT: MOV * T1.X, KC0[3].X,
2567 ; EG-NEXT: ALU clause starting at 12:
2568 ; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
2569 ; EG-NEXT: SETGE_UINT * T1.W, T0.X, T1.X,
2570 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2571 ; EG-NEXT: CNDE_INT T1.W, PS, T0.X, T1.X,
2572 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2573 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2574 ; EG-NEXT: LSHL T2.X, PV.W, PS,
2575 ; EG-NEXT: LSHL * T2.W, literal.x, PS,
2576 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2577 ; EG-NEXT: MOV T2.Y, 0.0,
2578 ; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x,
2579 ; EG-NEXT: SETGT_UINT * T1.W, T1.X, T0.X,
2580 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2581 ; EG-NEXT: AND_INT T1.W, PS, 1,
2582 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2583 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2584 ; EG-NEXT: LSHL T0.X, PV.W, PS,
2585 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
2586 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2587 ; EG-NEXT: MOV T0.Y, 0.0,
2588 ; EG-NEXT: MOV T2.Z, 0.0,
2589 ; EG-NEXT: MOV * T0.Z, 0.0,
2590 ; EG-NEXT: LSHR T1.X, KC0[2].Z, literal.x,
2591 ; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
2592 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2594 ; CI-LABEL: v_test_umin_ult_i16_multi_use:
2596 ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
2597 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2598 ; CI-NEXT: v_mov_b32_e32 v0, s4
2599 ; CI-NEXT: v_mov_b32_e32 v1, s5
2600 ; CI-NEXT: v_mov_b32_e32 v2, s6
2601 ; CI-NEXT: v_mov_b32_e32 v3, s7
2602 ; CI-NEXT: flat_load_ushort v4, v[0:1]
2603 ; CI-NEXT: flat_load_ushort v5, v[2:3]
2604 ; CI-NEXT: v_mov_b32_e32 v0, s0
2605 ; CI-NEXT: v_mov_b32_e32 v1, s1
2606 ; CI-NEXT: v_mov_b32_e32 v2, s2
2607 ; CI-NEXT: v_mov_b32_e32 v3, s3
2608 ; CI-NEXT: s_waitcnt vmcnt(0)
2609 ; CI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5
2610 ; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
2611 ; CI-NEXT: flat_store_short v[0:1], v4
2612 ; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
2613 ; CI-NEXT: flat_store_byte v[2:3], v0
2616 ; VI-LABEL: v_test_umin_ult_i16_multi_use:
2618 ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
2619 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2620 ; VI-NEXT: v_mov_b32_e32 v0, s4
2621 ; VI-NEXT: v_mov_b32_e32 v1, s5
2622 ; VI-NEXT: v_mov_b32_e32 v2, s6
2623 ; VI-NEXT: v_mov_b32_e32 v3, s7
2624 ; VI-NEXT: flat_load_ushort v4, v[0:1]
2625 ; VI-NEXT: flat_load_ushort v5, v[2:3]
2626 ; VI-NEXT: v_mov_b32_e32 v0, s0
2627 ; VI-NEXT: v_mov_b32_e32 v1, s1
2628 ; VI-NEXT: v_mov_b32_e32 v2, s2
2629 ; VI-NEXT: v_mov_b32_e32 v3, s3
2630 ; VI-NEXT: s_waitcnt vmcnt(0)
2631 ; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5
2632 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
2633 ; VI-NEXT: flat_store_short v[0:1], v4
2634 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
2635 ; VI-NEXT: flat_store_byte v[2:3], v0
2638 ; GFX9-LABEL: v_test_umin_ult_i16_multi_use:
2640 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
2641 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2642 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2643 ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5]
2644 ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
2645 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2646 ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
2647 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
2648 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
2649 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
2650 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
2651 ; GFX9-NEXT: s_endpgm
2653 ; GFX10-LABEL: v_test_umin_ult_i16_multi_use:
2655 ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
2656 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2657 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2658 ; GFX10-NEXT: s_clause 0x1
2659 ; GFX10-NEXT: global_load_ushort v1, v0, s[4:5]
2660 ; GFX10-NEXT: global_load_ushort v2, v0, s[6:7]
2661 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2662 ; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
2663 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
2664 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
2665 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
2666 ; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
2667 ; GFX10-NEXT: s_endpgm
2669 ; GFX11-LABEL: v_test_umin_ult_i16_multi_use:
2671 ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x0
2672 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2673 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2674 ; GFX11-NEXT: s_clause 0x1
2675 ; GFX11-NEXT: global_load_u16 v1, v0, s[4:5]
2676 ; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
2677 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2678 ; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
2679 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
2680 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
2681 ; GFX11-NEXT: s_clause 0x1
2682 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
2683 ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
2684 ; GFX11-NEXT: s_endpgm
2685 %a = load i16, ptr addrspace(1) %aptr, align 2
2686 %b = load i16, ptr addrspace(1) %bptr, align 2
2687 %cmp = icmp ult i16 %a, %b
2688 %val = select i1 %cmp, i16 %a, i16 %b
2689 store i16 %val, ptr addrspace(1) %out0, align 2
2690 store i1 %cmp, ptr addrspace(1) %out1
; Unsigned min on <1 x i32> kernel arguments, written as icmp ult + select
; with matching operands, with the result stored to %out. The expected output
; for every target contains a single unsigned-min instruction
; (MIN_UINT on EG, s_min_u32 on the GCN targets).
2694 define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
2695 ; EG-LABEL: s_test_umin_ult_v1i32:
2697 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
2698 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2701 ; EG-NEXT: ALU clause starting at 4:
2702 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
2703 ; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
2704 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2706 ; CI-LABEL: s_test_umin_ult_v1i32:
2708 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2709 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2710 ; CI-NEXT: s_min_u32 s2, s2, s3
2711 ; CI-NEXT: v_mov_b32_e32 v0, s0
2712 ; CI-NEXT: v_mov_b32_e32 v1, s1
2713 ; CI-NEXT: v_mov_b32_e32 v2, s2
2714 ; CI-NEXT: flat_store_dword v[0:1], v2
2717 ; VI-LABEL: s_test_umin_ult_v1i32:
2719 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2720 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2721 ; VI-NEXT: s_min_u32 s2, s2, s3
2722 ; VI-NEXT: v_mov_b32_e32 v0, s0
2723 ; VI-NEXT: v_mov_b32_e32 v1, s1
2724 ; VI-NEXT: v_mov_b32_e32 v2, s2
2725 ; VI-NEXT: flat_store_dword v[0:1], v2
2728 ; GFX9-LABEL: s_test_umin_ult_v1i32:
2730 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2731 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2732 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2733 ; GFX9-NEXT: s_min_u32 s2, s2, s3
2734 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2735 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2736 ; GFX9-NEXT: s_endpgm
2738 ; GFX10-LABEL: s_test_umin_ult_v1i32:
2740 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2741 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2742 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2743 ; GFX10-NEXT: s_min_u32 s2, s2, s3
2744 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
2745 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2746 ; GFX10-NEXT: s_endpgm
2748 ; GFX11-LABEL: s_test_umin_ult_v1i32:
2750 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
2751 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2752 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2753 ; GFX11-NEXT: s_min_u32 s2, s2, s3
2754 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2755 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
2756 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2757 ; GFX11-NEXT: s_endpgm
2758 %cmp = icmp ult <1 x i32> %a, %b
2759 %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
2760 store <1 x i32> %val, ptr addrspace(1) %out
; Unsigned min on <8 x i32> kernel arguments, written as a vector icmp ult +
; select with matching operands, with the result stored to %out. The expected
; output for every target contains eight per-element unsigned-min
; instructions (MIN_UINT on EG, s_min_u32 on the GCN targets) followed by two
; four-dword stores, one of them at byte offset 16 from %out.
2764 define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) #0 {
2765 ; EG-LABEL: s_test_umin_ult_v8i32:
2767 ; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
2768 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
2769 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2771 ; EG-NEXT: ALU clause starting at 4:
2772 ; EG-NEXT: MIN_UINT * T0.W, KC0[5].X, KC0[7].X,
2773 ; EG-NEXT: MIN_UINT * T0.Z, KC0[4].W, KC0[6].W,
2774 ; EG-NEXT: MIN_UINT * T0.Y, KC0[4].Z, KC0[6].Z,
2775 ; EG-NEXT: MIN_UINT * T0.X, KC0[4].Y, KC0[6].Y,
2776 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2777 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2778 ; EG-NEXT: MIN_UINT * T2.W, KC0[6].X, KC0[8].X,
2779 ; EG-NEXT: MIN_UINT * T2.Z, KC0[5].W, KC0[7].W,
2780 ; EG-NEXT: MIN_UINT * T2.Y, KC0[5].Z, KC0[7].Z,
2781 ; EG-NEXT: MIN_UINT * T2.X, KC0[5].Y, KC0[7].Y,
2782 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
2783 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2784 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
2785 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2787 ; CI-LABEL: s_test_umin_ult_v8i32:
2789 ; CI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8
2790 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
2791 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2792 ; CI-NEXT: s_min_u32 s4, s15, s23
2793 ; CI-NEXT: s_min_u32 s5, s14, s22
2794 ; CI-NEXT: s_min_u32 s6, s13, s21
2795 ; CI-NEXT: s_min_u32 s7, s12, s20
2796 ; CI-NEXT: s_min_u32 s2, s19, s27
2797 ; CI-NEXT: s_min_u32 s3, s18, s26
2798 ; CI-NEXT: s_min_u32 s8, s17, s25
2799 ; CI-NEXT: s_min_u32 s9, s16, s24
2800 ; CI-NEXT: v_mov_b32_e32 v3, s2
2801 ; CI-NEXT: s_add_u32 s2, s0, 16
2802 ; CI-NEXT: v_mov_b32_e32 v2, s3
2803 ; CI-NEXT: s_addc_u32 s3, s1, 0
2804 ; CI-NEXT: v_mov_b32_e32 v5, s3
2805 ; CI-NEXT: v_mov_b32_e32 v0, s9
2806 ; CI-NEXT: v_mov_b32_e32 v1, s8
2807 ; CI-NEXT: v_mov_b32_e32 v4, s2
2808 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2809 ; CI-NEXT: v_mov_b32_e32 v5, s1
2810 ; CI-NEXT: v_mov_b32_e32 v0, s7
2811 ; CI-NEXT: v_mov_b32_e32 v1, s6
2812 ; CI-NEXT: v_mov_b32_e32 v2, s5
2813 ; CI-NEXT: v_mov_b32_e32 v3, s4
2814 ; CI-NEXT: v_mov_b32_e32 v4, s0
2815 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2818 ; VI-LABEL: s_test_umin_ult_v8i32:
2820 ; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20
2821 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
2822 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2823 ; VI-NEXT: s_min_u32 s4, s15, s23
2824 ; VI-NEXT: s_min_u32 s5, s14, s22
2825 ; VI-NEXT: s_min_u32 s6, s13, s21
2826 ; VI-NEXT: s_min_u32 s7, s12, s20
2827 ; VI-NEXT: s_min_u32 s2, s19, s27
2828 ; VI-NEXT: s_min_u32 s3, s18, s26
2829 ; VI-NEXT: s_min_u32 s8, s17, s25
2830 ; VI-NEXT: s_min_u32 s9, s16, s24
2831 ; VI-NEXT: v_mov_b32_e32 v3, s2
2832 ; VI-NEXT: s_add_u32 s2, s0, 16
2833 ; VI-NEXT: v_mov_b32_e32 v2, s3
2834 ; VI-NEXT: s_addc_u32 s3, s1, 0
2835 ; VI-NEXT: v_mov_b32_e32 v5, s3
2836 ; VI-NEXT: v_mov_b32_e32 v0, s9
2837 ; VI-NEXT: v_mov_b32_e32 v1, s8
2838 ; VI-NEXT: v_mov_b32_e32 v4, s2
2839 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2840 ; VI-NEXT: v_mov_b32_e32 v5, s1
2841 ; VI-NEXT: v_mov_b32_e32 v0, s7
2842 ; VI-NEXT: v_mov_b32_e32 v1, s6
2843 ; VI-NEXT: v_mov_b32_e32 v2, s5
2844 ; VI-NEXT: v_mov_b32_e32 v3, s4
2845 ; VI-NEXT: v_mov_b32_e32 v4, s0
2846 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2849 ; GFX9-LABEL: s_test_umin_ult_v8i32:
2851 ; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20
2852 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
2853 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2854 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2855 ; GFX9-NEXT: s_min_u32 s6, s19, s27
2856 ; GFX9-NEXT: s_min_u32 s7, s18, s26
2857 ; GFX9-NEXT: s_min_u32 s8, s17, s25
2858 ; GFX9-NEXT: s_min_u32 s9, s16, s24
2859 ; GFX9-NEXT: s_min_u32 s2, s15, s23
2860 ; GFX9-NEXT: s_min_u32 s3, s14, s22
2861 ; GFX9-NEXT: s_min_u32 s4, s13, s21
2862 ; GFX9-NEXT: s_min_u32 s5, s12, s20
2863 ; GFX9-NEXT: v_mov_b32_e32 v0, s9
2864 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
2865 ; GFX9-NEXT: v_mov_b32_e32 v2, s7
2866 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
2867 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
2868 ; GFX9-NEXT: s_nop 0
2869 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
2870 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
2871 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
2872 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
2873 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
2874 ; GFX9-NEXT: s_endpgm
2876 ; GFX10-LABEL: s_test_umin_ult_v8i32:
2878 ; GFX10-NEXT: s_clause 0x1
2879 ; GFX10-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20
2880 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
2881 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
2882 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2883 ; GFX10-NEXT: s_min_u32 s6, s19, s27
2884 ; GFX10-NEXT: s_min_u32 s7, s18, s26
2885 ; GFX10-NEXT: s_min_u32 s8, s16, s24
2886 ; GFX10-NEXT: s_min_u32 s9, s17, s25
2887 ; GFX10-NEXT: s_min_u32 s2, s15, s23
2888 ; GFX10-NEXT: s_min_u32 s3, s14, s22
2889 ; GFX10-NEXT: s_min_u32 s4, s13, s21
2890 ; GFX10-NEXT: s_min_u32 s5, s12, s20
2891 ; GFX10-NEXT: v_mov_b32_e32 v0, s8
2892 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
2893 ; GFX10-NEXT: v_mov_b32_e32 v2, s7
2894 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
2895 ; GFX10-NEXT: v_mov_b32_e32 v4, s5
2896 ; GFX10-NEXT: v_mov_b32_e32 v5, s4
2897 ; GFX10-NEXT: v_mov_b32_e32 v6, s3
2898 ; GFX10-NEXT: v_mov_b32_e32 v7, s2
2899 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
2900 ; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
2901 ; GFX10-NEXT: s_endpgm
2903 ; GFX11-LABEL: s_test_umin_ult_v8i32:
2905 ; GFX11-NEXT: s_clause 0x1
2906 ; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x20
2907 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2908 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
2909 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2910 ; GFX11-NEXT: s_min_u32 s4, s9, s17
2911 ; GFX11-NEXT: s_min_u32 s5, s8, s16
2912 ; GFX11-NEXT: s_min_u32 s6, s15, s23
2913 ; GFX11-NEXT: s_min_u32 s7, s14, s22
2914 ; GFX11-NEXT: s_min_u32 s8, s12, s20
2915 ; GFX11-NEXT: s_min_u32 s9, s13, s21
2916 ; GFX11-NEXT: s_min_u32 s2, s11, s19
2917 ; GFX11-NEXT: s_min_u32 s3, s10, s18
2918 ; GFX11-NEXT: v_mov_b32_e32 v0, s8
2919 ; GFX11-NEXT: v_mov_b32_e32 v1, s9
2920 ; GFX11-NEXT: v_mov_b32_e32 v2, s7
2921 ; GFX11-NEXT: v_mov_b32_e32 v3, s6
2922 ; GFX11-NEXT: v_mov_b32_e32 v4, s5
2923 ; GFX11-NEXT: v_mov_b32_e32 v5, s4
2924 ; GFX11-NEXT: v_mov_b32_e32 v6, s3
2925 ; GFX11-NEXT: v_mov_b32_e32 v7, s2
2926 ; GFX11-NEXT: s_clause 0x1
2927 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
2928 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
2929 ; GFX11-NEXT: s_endpgm
2930 %cmp = icmp ult <8 x i32> %a, %b
2931 %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
2932 store <8 x i32> %val, ptr addrspace(1) %out
; Unsigned min on <8 x i16> kernel arguments, written as a vector icmp ult +
; select with matching operands, with the result stored to %out.
; What the expected output shows per target:
;  - EG reads each 16-bit element separately (VTX_READ_16), applies MIN_UINT
;    per element and merges the halves back into dwords with shift/and/or.
;  - CI and VI split every dword into its high half (s_lshr_b32 by 16) and low
;    half (s_and_b32 with 0xffff), apply eight s_min_u32, then repack with
;    s_lshl_b32 / s_or_b32.
;  - GFX9, GFX10 and GFX11 use four v_pk_min_u16, one per dword.
2936 define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16> %a, <8 x i16> %b) #0 {
2937 ; EG-LABEL: s_test_umin_ult_v8i16:
2939 ; EG-NEXT: ALU 1, @52, KC0[], KC1[]
2940 ; EG-NEXT: TEX 1 @20
2941 ; EG-NEXT: ALU 9, @54, KC0[], KC1[]
2942 ; EG-NEXT: TEX 1 @24
2943 ; EG-NEXT: ALU 8, @64, KC0[], KC1[]
2944 ; EG-NEXT: TEX 1 @28
2945 ; EG-NEXT: ALU 10, @73, KC0[], KC1[]
2946 ; EG-NEXT: TEX 1 @32
2947 ; EG-NEXT: ALU 8, @84, KC0[], KC1[]
2948 ; EG-NEXT: TEX 1 @36
2949 ; EG-NEXT: ALU 10, @93, KC0[], KC1[]
2950 ; EG-NEXT: TEX 1 @40
2951 ; EG-NEXT: ALU 8, @104, KC0[], KC1[]
2952 ; EG-NEXT: TEX 1 @44
2953 ; EG-NEXT: ALU 10, @113, KC0[], KC1[]
2954 ; EG-NEXT: TEX 1 @48
2955 ; EG-NEXT: ALU 10, @124, KC0[CB0:0-32], KC1[]
2956 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
2959 ; EG-NEXT: Fetch clause starting at 20:
2960 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
2961 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 82, #3
2962 ; EG-NEXT: Fetch clause starting at 24:
2963 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
2964 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 80, #3
2965 ; EG-NEXT: Fetch clause starting at 28:
2966 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
2967 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 78, #3
2968 ; EG-NEXT: Fetch clause starting at 32:
2969 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
2970 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 76, #3
2971 ; EG-NEXT: Fetch clause starting at 36:
2972 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
2973 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 74, #3
2974 ; EG-NEXT: Fetch clause starting at 40:
2975 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
2976 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 72, #3
2977 ; EG-NEXT: Fetch clause starting at 44:
2978 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
2979 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 70, #3
2980 ; EG-NEXT: Fetch clause starting at 48:
2981 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 52, #3
2982 ; EG-NEXT: VTX_READ_16 T7.X, T7.X, 68, #3
2983 ; EG-NEXT: ALU clause starting at 52:
2984 ; EG-NEXT: MOV * T0.Y, T3.X,
2985 ; EG-NEXT: MOV * T7.X, 0.0,
2986 ; EG-NEXT: ALU clause starting at 54:
2987 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
2988 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
2989 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2990 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
2991 ; EG-NEXT: LSHL T0.W, PV.W, literal.x,
2992 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
2993 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
2994 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
2995 ; EG-NEXT: MOV * T3.X, PV.W,
2996 ; EG-NEXT: MOV * T0.Y, PV.X,
2997 ; EG-NEXT: ALU clause starting at 64:
2998 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
2999 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3000 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3001 ; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
3002 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3003 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
3004 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3005 ; EG-NEXT: MOV T3.X, PV.W,
3006 ; EG-NEXT: MOV * T0.Y, T2.X,
3007 ; EG-NEXT: ALU clause starting at 73:
3008 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3009 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3010 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3011 ; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
3012 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
3013 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3014 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3015 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3016 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3017 ; EG-NEXT: MOV * T2.X, PV.W,
3018 ; EG-NEXT: MOV * T0.Y, PV.X,
3019 ; EG-NEXT: ALU clause starting at 84:
3020 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3021 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3022 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3023 ; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
3024 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3025 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
3026 ; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
3027 ; EG-NEXT: MOV T2.X, PV.Z,
3028 ; EG-NEXT: MOV * T0.Y, T5.X,
3029 ; EG-NEXT: ALU clause starting at 93:
3030 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3031 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3032 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3033 ; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
3034 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
3035 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3036 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3037 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3038 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3039 ; EG-NEXT: MOV * T5.X, PV.W,
3040 ; EG-NEXT: MOV * T0.Y, PV.X,
3041 ; EG-NEXT: ALU clause starting at 104:
3042 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3043 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3044 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3045 ; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
3046 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3047 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
3048 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3049 ; EG-NEXT: MOV T5.X, PV.W,
3050 ; EG-NEXT: MOV * T0.Y, T4.X,
3051 ; EG-NEXT: ALU clause starting at 113:
3052 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3053 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3054 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3055 ; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
3056 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
3057 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3058 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3059 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3060 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3061 ; EG-NEXT: MOV * T4.X, PV.W,
3062 ; EG-NEXT: MOV * T0.Y, PV.X,
3063 ; EG-NEXT: ALU clause starting at 124:
3064 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3065 ; EG-NEXT: AND_INT * T1.W, T7.X, literal.x,
3066 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3067 ; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
3068 ; EG-NEXT: AND_INT T2.W, T0.Y, literal.y,
3069 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3070 ; EG-NEXT: 2(2.802597e-45), -65536(nan)
3071 ; EG-NEXT: OR_INT * T7.X, PV.W, PS,
3072 ; EG-NEXT: MOV T4.X, PV.X,
3073 ; EG-NEXT: MOV * T7.W, T3.X,
3074 ; EG-NEXT: MOV * T7.Y, T5.X,
3076 ; CI-LABEL: s_test_umin_ult_v8i16:
3078 ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
3079 ; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
3080 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3081 ; CI-NEXT: s_lshr_b32 s10, s0, 16
3082 ; CI-NEXT: s_and_b32 s0, s0, 0xffff
3083 ; CI-NEXT: s_lshr_b32 s11, s1, 16
3084 ; CI-NEXT: s_and_b32 s1, s1, 0xffff
3085 ; CI-NEXT: s_lshr_b32 s12, s2, 16
3086 ; CI-NEXT: s_and_b32 s2, s2, 0xffff
3087 ; CI-NEXT: s_lshr_b32 s13, s3, 16
3088 ; CI-NEXT: s_and_b32 s3, s3, 0xffff
3089 ; CI-NEXT: s_lshr_b32 s14, s4, 16
3090 ; CI-NEXT: s_and_b32 s4, s4, 0xffff
3091 ; CI-NEXT: s_lshr_b32 s15, s5, 16
3092 ; CI-NEXT: s_and_b32 s5, s5, 0xffff
3093 ; CI-NEXT: s_lshr_b32 s16, s6, 16
3094 ; CI-NEXT: s_and_b32 s6, s6, 0xffff
3095 ; CI-NEXT: s_lshr_b32 s17, s7, 16
3096 ; CI-NEXT: s_and_b32 s7, s7, 0xffff
3097 ; CI-NEXT: s_min_u32 s3, s3, s7
3098 ; CI-NEXT: s_min_u32 s7, s13, s17
3099 ; CI-NEXT: s_min_u32 s2, s2, s6
3100 ; CI-NEXT: s_min_u32 s6, s12, s16
3101 ; CI-NEXT: s_min_u32 s1, s1, s5
3102 ; CI-NEXT: s_min_u32 s5, s11, s15
3103 ; CI-NEXT: s_min_u32 s0, s0, s4
3104 ; CI-NEXT: s_min_u32 s4, s10, s14
3105 ; CI-NEXT: s_lshl_b32 s7, s7, 16
3106 ; CI-NEXT: s_lshl_b32 s6, s6, 16
3107 ; CI-NEXT: s_lshl_b32 s5, s5, 16
3108 ; CI-NEXT: s_lshl_b32 s4, s4, 16
3109 ; CI-NEXT: s_or_b32 s3, s3, s7
3110 ; CI-NEXT: s_or_b32 s2, s2, s6
3111 ; CI-NEXT: s_or_b32 s1, s1, s5
3112 ; CI-NEXT: s_or_b32 s0, s0, s4
3113 ; CI-NEXT: v_mov_b32_e32 v4, s8
3114 ; CI-NEXT: v_mov_b32_e32 v0, s0
3115 ; CI-NEXT: v_mov_b32_e32 v1, s1
3116 ; CI-NEXT: v_mov_b32_e32 v2, s2
3117 ; CI-NEXT: v_mov_b32_e32 v3, s3
3118 ; CI-NEXT: v_mov_b32_e32 v5, s9
3119 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3122 ; VI-LABEL: s_test_umin_ult_v8i16:
3124 ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
3125 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
3126 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3127 ; VI-NEXT: s_lshr_b32 s10, s3, 16
3128 ; VI-NEXT: s_and_b32 s3, s3, 0xffff
3129 ; VI-NEXT: s_lshr_b32 s11, s2, 16
3130 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
3131 ; VI-NEXT: s_lshr_b32 s12, s1, 16
3132 ; VI-NEXT: s_and_b32 s1, s1, 0xffff
3133 ; VI-NEXT: s_lshr_b32 s13, s0, 16
3134 ; VI-NEXT: s_and_b32 s0, s0, 0xffff
3135 ; VI-NEXT: s_lshr_b32 s14, s7, 16
3136 ; VI-NEXT: s_and_b32 s7, s7, 0xffff
3137 ; VI-NEXT: s_lshr_b32 s15, s6, 16
3138 ; VI-NEXT: s_and_b32 s6, s6, 0xffff
3139 ; VI-NEXT: s_lshr_b32 s16, s5, 16
3140 ; VI-NEXT: s_and_b32 s5, s5, 0xffff
3141 ; VI-NEXT: s_lshr_b32 s17, s4, 16
3142 ; VI-NEXT: s_and_b32 s4, s4, 0xffff
3143 ; VI-NEXT: s_min_u32 s0, s0, s4
3144 ; VI-NEXT: s_min_u32 s4, s13, s17
3145 ; VI-NEXT: s_min_u32 s1, s1, s5
3146 ; VI-NEXT: s_min_u32 s5, s12, s16
3147 ; VI-NEXT: s_min_u32 s2, s2, s6
3148 ; VI-NEXT: s_min_u32 s6, s11, s15
3149 ; VI-NEXT: s_min_u32 s3, s3, s7
3150 ; VI-NEXT: s_min_u32 s7, s10, s14
3151 ; VI-NEXT: s_lshl_b32 s7, s7, 16
3152 ; VI-NEXT: s_lshl_b32 s6, s6, 16
3153 ; VI-NEXT: s_lshl_b32 s5, s5, 16
3154 ; VI-NEXT: s_lshl_b32 s4, s4, 16
3155 ; VI-NEXT: s_or_b32 s3, s3, s7
3156 ; VI-NEXT: s_or_b32 s2, s2, s6
3157 ; VI-NEXT: s_or_b32 s1, s1, s5
3158 ; VI-NEXT: s_or_b32 s0, s0, s4
3159 ; VI-NEXT: v_mov_b32_e32 v4, s8
3160 ; VI-NEXT: v_mov_b32_e32 v0, s0
3161 ; VI-NEXT: v_mov_b32_e32 v1, s1
3162 ; VI-NEXT: v_mov_b32_e32 v2, s2
3163 ; VI-NEXT: v_mov_b32_e32 v3, s3
3164 ; VI-NEXT: v_mov_b32_e32 v5, s9
3165 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3168 ; GFX9-LABEL: s_test_umin_ult_v8i16:
3170 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
3171 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
3172 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
3173 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3174 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
3175 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3176 ; GFX9-NEXT: v_pk_min_u16 v3, s3, v0
3177 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
3178 ; GFX9-NEXT: v_pk_min_u16 v2, s2, v1
3179 ; GFX9-NEXT: v_pk_min_u16 v1, s1, v0
3180 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3181 ; GFX9-NEXT: v_pk_min_u16 v0, s0, v0
3182 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
3183 ; GFX9-NEXT: s_endpgm
3185 ; GFX10-LABEL: s_test_umin_ult_v8i16:
3187 ; GFX10-NEXT: s_clause 0x1
3188 ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
3189 ; GFX10-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
3190 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
3191 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3192 ; GFX10-NEXT: v_pk_min_u16 v3, s3, s7
3193 ; GFX10-NEXT: v_pk_min_u16 v2, s2, s6
3194 ; GFX10-NEXT: v_pk_min_u16 v1, s1, s5
3195 ; GFX10-NEXT: v_pk_min_u16 v0, s0, s4
3196 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
3197 ; GFX10-NEXT: s_endpgm
3199 ; GFX11-LABEL: s_test_umin_ult_v8i16:
3201 ; GFX11-NEXT: s_clause 0x1
3202 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x10
3203 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3204 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
3205 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3206 ; GFX11-NEXT: v_pk_min_u16 v3, s11, s15
3207 ; GFX11-NEXT: v_pk_min_u16 v2, s10, s14
3208 ; GFX11-NEXT: v_pk_min_u16 v1, s9, s13
3209 ; GFX11-NEXT: v_pk_min_u16 v0, s8, s12
3210 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
3211 ; GFX11-NEXT: s_endpgm
3212 %cmp = icmp ult <8 x i16> %a, %b
3213 %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
3214 store <8 x i16> %val, ptr addrspace(1) %out
3218 ; Make sure the redundant 'and' is removed.
; Both i16 arguments are zero-extended to i32 before the icmp ult + select,
; so the selected value already fits in 16 bits and the trailing
; `and i32 %val, 65535` cannot change it. In the expected output for the GCN
; targets the only 0xffff masks are applied to the two loaded arguments,
; ahead of s_min_u32; no mask follows the min.
3220 define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
3221 ; EG-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3223 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
3225 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
3226 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
3229 ; EG-NEXT: Fetch clause starting at 6:
3230 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 72, #3
3231 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 108, #3
3232 ; EG-NEXT: ALU clause starting at 10:
3233 ; EG-NEXT: MOV * T0.X, 0.0,
3234 ; EG-NEXT: ALU clause starting at 11:
3235 ; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
3236 ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
3237 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3238 ; EG-NEXT: MIN_UINT T0.X, PV.Z, PV.W,
3239 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3240 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3242 ; CI-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3244 ; CI-NEXT: s_load_dword s2, s[8:9], 0xa
3245 ; CI-NEXT: s_load_dword s3, s[8:9], 0x13
3246 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
3247 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3248 ; CI-NEXT: s_and_b32 s2, s2, 0xffff
3249 ; CI-NEXT: s_and_b32 s3, s3, 0xffff
3250 ; CI-NEXT: s_min_u32 s2, s2, s3
3251 ; CI-NEXT: v_mov_b32_e32 v0, s0
3252 ; CI-NEXT: v_mov_b32_e32 v1, s1
3253 ; CI-NEXT: v_mov_b32_e32 v2, s2
3254 ; CI-NEXT: flat_store_dword v[0:1], v2
3257 ; VI-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3259 ; VI-NEXT: s_load_dword s2, s[8:9], 0x28
3260 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
3261 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
3262 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3263 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
3264 ; VI-NEXT: s_and_b32 s3, s3, 0xffff
3265 ; VI-NEXT: s_min_u32 s2, s2, s3
3266 ; VI-NEXT: v_mov_b32_e32 v0, s0
3267 ; VI-NEXT: v_mov_b32_e32 v1, s1
3268 ; VI-NEXT: v_mov_b32_e32 v2, s2
3269 ; VI-NEXT: flat_store_dword v[0:1], v2
3272 ; GFX9-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3274 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28
3275 ; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c
3276 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
3277 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3278 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3279 ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
3280 ; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
3281 ; GFX9-NEXT: s_min_u32 s2, s2, s3
3282 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3283 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
3284 ; GFX9-NEXT: s_endpgm
3286 ; GFX10-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3288 ; GFX10-NEXT: s_clause 0x2
3289 ; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28
3290 ; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c
3291 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
3292 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
3293 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3294 ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
3295 ; GFX10-NEXT: s_and_b32 s3, s3, 0xffff
3296 ; GFX10-NEXT: s_min_u32 s2, s2, s3
3297 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
3298 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
3299 ; GFX10-NEXT: s_endpgm
3301 ; GFX11-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3303 ; GFX11-NEXT: s_clause 0x2
3304 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28
3305 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c
3306 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3307 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3308 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3309 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
3310 ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff
3311 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3312 ; GFX11-NEXT: s_min_u32 s2, s2, s3
3313 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
3314 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
3315 ; GFX11-NEXT: s_endpgm
3316 %a.ext = zext i16 %a to i32
3317 %b.ext = zext i16 %b to i32
3318 %cmp = icmp ult i32 %a.ext, %b.ext
3319 %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
3320 %mask = and i32 %val, 65535
3321 store i32 %mask, ptr addrspace(1) %out
3325 ; Make sure the redundant sign_extend_inreg is removed.
; Both i16 arguments are sign-extended to i32 before the icmp slt + select,
; and the shl/ashr by 16 that follows re-sign-extends the low 16 bits of the
; selected value. The expected output extends each input once
; (BFE_INT on EG, s_sext_i32_i16 on the GCN targets) and applies the signed
; min (MIN_INT / s_min_i32) with no further extension after it.
3327 define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace(1) %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 {
3328 ; EG-LABEL: simplify_demanded_bits_test_min_slt_i16:
3330 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
3332 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
3333 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
3336 ; EG-NEXT: Fetch clause starting at 6:
3337 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 72, #3
3338 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 108, #3
3339 ; EG-NEXT: ALU clause starting at 10:
3340 ; EG-NEXT: MOV * T0.X, 0.0,
3341 ; EG-NEXT: ALU clause starting at 11:
3342 ; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
3343 ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
3344 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3345 ; EG-NEXT: MIN_INT T0.X, PV.Z, PV.W,
3346 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3347 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3349 ; CI-LABEL: simplify_demanded_bits_test_min_slt_i16:
3351 ; CI-NEXT: s_load_dword s2, s[8:9], 0xa
3352 ; CI-NEXT: s_load_dword s3, s[8:9], 0x13
3353 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
3354 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3355 ; CI-NEXT: s_sext_i32_i16 s2, s2
3356 ; CI-NEXT: s_sext_i32_i16 s3, s3
3357 ; CI-NEXT: s_min_i32 s2, s2, s3
3358 ; CI-NEXT: v_mov_b32_e32 v0, s0
3359 ; CI-NEXT: v_mov_b32_e32 v1, s1
3360 ; CI-NEXT: v_mov_b32_e32 v2, s2
3361 ; CI-NEXT: flat_store_dword v[0:1], v2
3364 ; VI-LABEL: simplify_demanded_bits_test_min_slt_i16:
3366 ; VI-NEXT: s_load_dword s2, s[8:9], 0x28
3367 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
3368 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
3369 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3370 ; VI-NEXT: s_sext_i32_i16 s2, s2
3371 ; VI-NEXT: s_sext_i32_i16 s3, s3
3372 ; VI-NEXT: s_min_i32 s2, s2, s3
3373 ; VI-NEXT: v_mov_b32_e32 v0, s0
3374 ; VI-NEXT: v_mov_b32_e32 v1, s1
3375 ; VI-NEXT: v_mov_b32_e32 v2, s2
3376 ; VI-NEXT: flat_store_dword v[0:1], v2
3379 ; GFX9-LABEL: simplify_demanded_bits_test_min_slt_i16:
3381 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28
3382 ; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c
3383 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
3384 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3385 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3386 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
3387 ; GFX9-NEXT: s_sext_i32_i16 s3, s3
3388 ; GFX9-NEXT: s_min_i32 s2, s2, s3
3389 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3390 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
3391 ; GFX9-NEXT: s_endpgm
3393 ; GFX10-LABEL: simplify_demanded_bits_test_min_slt_i16:
3395 ; GFX10-NEXT: s_clause 0x2
3396 ; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28
3397 ; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c
3398 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
3399 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
3400 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3401 ; GFX10-NEXT: s_sext_i32_i16 s2, s2
3402 ; GFX10-NEXT: s_sext_i32_i16 s3, s3
3403 ; GFX10-NEXT: s_min_i32 s2, s2, s3
3404 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
3405 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
3406 ; GFX10-NEXT: s_endpgm
3408 ; GFX11-LABEL: simplify_demanded_bits_test_min_slt_i16:
3410 ; GFX11-NEXT: s_clause 0x2
3411 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28
3412 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c
3413 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3414 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3415 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3416 ; GFX11-NEXT: s_sext_i32_i16 s2, s2
3417 ; GFX11-NEXT: s_sext_i32_i16 s3, s3
3418 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3419 ; GFX11-NEXT: s_min_i32 s2, s2, s3
3420 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
3421 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
3422 ; GFX11-NEXT: s_endpgm
3423 %a.ext = sext i16 %a to i32
3424 %b.ext = sext i16 %b to i32
3425 %cmp = icmp slt i32 %a.ext, %b.ext
3426 %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
3427 %shl = shl i32 %val, 16
3428 %sextinreg = ashr i32 %shl, 16
3429 store i32 %sextinreg, ptr addrspace(1) %out
; Scalar signed i16 min: both i16 operands are kernel arguments, compared with
; "icmp sle" and combined with "select"; the i16 result is stored to %out.
; The amdgcn checks below expect a single dword load covering both arguments,
; sign extension of each half (s_sext_i32_i16 for one, s_ashr_i32 by 16 for
; the other), a 32-bit s_min_i32, and a 16-bit store. The r600 checks expect
; BFE_INT sign extension followed by MIN_INT.
3433 define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i16 %b) #0 {
3434 ; EG-LABEL: s_test_imin_sle_i16:
3436 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
3438 ; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
3439 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
3442 ; EG-NEXT: Fetch clause starting at 6:
3443 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
3444 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 42, #3
3445 ; EG-NEXT: ALU clause starting at 10:
3446 ; EG-NEXT: MOV * T0.X, 0.0,
3447 ; EG-NEXT: ALU clause starting at 11:
3448 ; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
3449 ; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
3450 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
3451 ; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45)
3452 ; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
3453 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
3454 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
3455 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
3456 ; EG-NEXT: LSHL T0.X, PV.W, PS,
3457 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
3458 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3459 ; EG-NEXT: MOV T0.Y, 0.0,
3460 ; EG-NEXT: MOV * T0.Z, 0.0,
3461 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3462 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3464 ; CI-LABEL: s_test_imin_sle_i16:
3466 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2
3467 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
3468 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3469 ; CI-NEXT: s_sext_i32_i16 s3, s2
3470 ; CI-NEXT: s_ashr_i32 s2, s2, 16
3471 ; CI-NEXT: s_min_i32 s2, s3, s2
3472 ; CI-NEXT: v_mov_b32_e32 v0, s0
3473 ; CI-NEXT: v_mov_b32_e32 v1, s1
3474 ; CI-NEXT: v_mov_b32_e32 v2, s2
3475 ; CI-NEXT: flat_store_short v[0:1], v2
3478 ; VI-LABEL: s_test_imin_sle_i16:
3480 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
3481 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
3482 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3483 ; VI-NEXT: s_sext_i32_i16 s3, s2
3484 ; VI-NEXT: s_ashr_i32 s2, s2, 16
3485 ; VI-NEXT: s_min_i32 s2, s3, s2
3486 ; VI-NEXT: v_mov_b32_e32 v0, s0
3487 ; VI-NEXT: v_mov_b32_e32 v1, s1
3488 ; VI-NEXT: v_mov_b32_e32 v2, s2
3489 ; VI-NEXT: flat_store_short v[0:1], v2
3492 ; GFX9-LABEL: s_test_imin_sle_i16:
3494 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
3495 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
3496 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3497 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3498 ; GFX9-NEXT: s_sext_i32_i16 s3, s2
3499 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16
3500 ; GFX9-NEXT: s_min_i32 s2, s3, s2
3501 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3502 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
3503 ; GFX9-NEXT: s_endpgm
3505 ; GFX10-LABEL: s_test_imin_sle_i16:
3507 ; GFX10-NEXT: s_clause 0x1
3508 ; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
3509 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
3510 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
3511 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3512 ; GFX10-NEXT: s_sext_i32_i16 s3, s2
3513 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16
3514 ; GFX10-NEXT: s_min_i32 s2, s3, s2
3515 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
3516 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
3517 ; GFX10-NEXT: s_endpgm
3519 ; GFX11-LABEL: s_test_imin_sle_i16:
3521 ; GFX11-NEXT: s_clause 0x1
3522 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
3523 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3524 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3525 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3526 ; GFX11-NEXT: s_sext_i32_i16 s3, s2
3527 ; GFX11-NEXT: s_ashr_i32 s2, s2, 16
3528 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3529 ; GFX11-NEXT: s_min_i32 s2, s3, s2
3530 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
3531 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
3532 ; GFX11-NEXT: s_endpgm
; IR under test: the compare+select pair is the min idiom the checks above
; expect to be folded into a single min instruction.
3533 %cmp = icmp sle i16 %a, %b
3534 %val = select i1 %cmp, i16 %a, i16 %b
3535 store i16 %val, ptr addrspace(1) %out
; Scalar unsigned i64 min written as "icmp ult" + "select" on two i64 kernel
; arguments; the i64 result is stored to %out with align 8.
; None of the checks below contain a 64-bit min instruction: the amdgcn
; targets are expected to emit a 64-bit unsigned compare (v_cmp_lt_u64)
; followed by one s_cselect_b32 per 32-bit half, and r600 is expected to emit
; a SETE_INT/SETGT_UINT/CNDE_INT sequence.
3541 define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3542 ; EG-LABEL: test_umin_ult_i64:
3544 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
3545 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3548 ; EG-NEXT: ALU clause starting at 4:
3549 ; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3550 ; EG-NEXT: SETGT_UINT * T0.W, KC0[3].Z, KC0[3].X,
3551 ; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3552 ; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3553 ; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3554 ; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3555 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3556 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3558 ; CI-LABEL: test_umin_ult_i64:
3560 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3561 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
3562 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3563 ; CI-NEXT: v_mov_b32_e32 v0, s0
3564 ; CI-NEXT: v_mov_b32_e32 v1, s4
3565 ; CI-NEXT: v_mov_b32_e32 v2, s5
3566 ; CI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
3567 ; CI-NEXT: v_mov_b32_e32 v1, s1
3568 ; CI-NEXT: s_and_b64 s[0:1], vcc, exec
3569 ; CI-NEXT: s_cselect_b32 s0, s3, s5
3570 ; CI-NEXT: s_cselect_b32 s1, s2, s4
3571 ; CI-NEXT: v_mov_b32_e32 v2, s1
3572 ; CI-NEXT: v_mov_b32_e32 v3, s0
3573 ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3576 ; VI-LABEL: test_umin_ult_i64:
3578 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3579 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
3580 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3581 ; VI-NEXT: v_mov_b32_e32 v0, s0
3582 ; VI-NEXT: v_mov_b32_e32 v1, s4
3583 ; VI-NEXT: v_mov_b32_e32 v2, s5
3584 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
3585 ; VI-NEXT: v_mov_b32_e32 v1, s1
3586 ; VI-NEXT: s_and_b64 s[0:1], vcc, exec
3587 ; VI-NEXT: s_cselect_b32 s0, s3, s5
3588 ; VI-NEXT: s_cselect_b32 s1, s2, s4
3589 ; VI-NEXT: v_mov_b32_e32 v2, s1
3590 ; VI-NEXT: v_mov_b32_e32 v3, s0
3591 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3594 ; GFX9-LABEL: test_umin_ult_i64:
3596 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
3597 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3598 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3599 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3600 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3601 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3602 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3603 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
3604 ; GFX9-NEXT: s_cselect_b32 s3, s3, s5
3605 ; GFX9-NEXT: s_cselect_b32 s2, s2, s4
3606 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3607 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3608 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3609 ; GFX9-NEXT: s_endpgm
3611 ; GFX10-LABEL: test_umin_ult_i64:
3613 ; GFX10-NEXT: s_clause 0x1
3614 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3615 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
3616 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3617 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3618 ; GFX10-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5]
3619 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
3620 ; GFX10-NEXT: s_cselect_b32 s2, s2, s4
3621 ; GFX10-NEXT: s_cselect_b32 s3, s3, s5
3622 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
3623 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
3624 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3625 ; GFX10-NEXT: s_endpgm
3627 ; GFX11-LABEL: test_umin_ult_i64:
3629 ; GFX11-NEXT: s_clause 0x1
3630 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
3631 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
3632 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
3633 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3634 ; GFX11-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5]
3635 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3636 ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
3637 ; GFX11-NEXT: s_cselect_b32 s2, s2, s4
3638 ; GFX11-NEXT: s_cselect_b32 s3, s3, s5
3639 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
3640 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
3641 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
3642 ; GFX11-NEXT: s_endpgm
; IR under test: strict unsigned less-than selects %a, otherwise %b.
3643 %tmp = icmp ult i64 %a, %b
3644 %val = select i1 %tmp, i64 %a, i64 %b
3645 store i64 %val, ptr addrspace(1) %out, align 8
; Scalar unsigned i64 min written as "icmp ule" + "select" on two i64 kernel
; arguments; the i64 result is stored to %out with align 8.
; The amdgcn checks below expect a 64-bit unsigned less-or-equal compare
; (v_cmp_le_u64) followed by one s_cselect_b32 per 32-bit half; the r600
; checks expect a SETE_INT/SETGT_UINT/CNDE_INT sequence.
3649 define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3650 ; EG-LABEL: test_umin_ule_i64:
3652 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
3653 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3656 ; EG-NEXT: ALU clause starting at 4:
3657 ; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3658 ; EG-NEXT: SETGT_UINT * T0.W, KC0[3].Z, KC0[3].X,
3659 ; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3660 ; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3661 ; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3662 ; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3663 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3664 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3666 ; CI-LABEL: test_umin_ule_i64:
3668 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3669 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
3670 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3671 ; CI-NEXT: v_mov_b32_e32 v0, s0
3672 ; CI-NEXT: v_mov_b32_e32 v1, s4
3673 ; CI-NEXT: v_mov_b32_e32 v2, s5
3674 ; CI-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
3675 ; CI-NEXT: v_mov_b32_e32 v1, s1
3676 ; CI-NEXT: s_and_b64 s[0:1], vcc, exec
3677 ; CI-NEXT: s_cselect_b32 s0, s3, s5
3678 ; CI-NEXT: s_cselect_b32 s1, s2, s4
3679 ; CI-NEXT: v_mov_b32_e32 v2, s1
3680 ; CI-NEXT: v_mov_b32_e32 v3, s0
3681 ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3684 ; VI-LABEL: test_umin_ule_i64:
3686 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3687 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
3688 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3689 ; VI-NEXT: v_mov_b32_e32 v0, s0
3690 ; VI-NEXT: v_mov_b32_e32 v1, s4
3691 ; VI-NEXT: v_mov_b32_e32 v2, s5
3692 ; VI-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
3693 ; VI-NEXT: v_mov_b32_e32 v1, s1
3694 ; VI-NEXT: s_and_b64 s[0:1], vcc, exec
3695 ; VI-NEXT: s_cselect_b32 s0, s3, s5
3696 ; VI-NEXT: s_cselect_b32 s1, s2, s4
3697 ; VI-NEXT: v_mov_b32_e32 v2, s1
3698 ; VI-NEXT: v_mov_b32_e32 v3, s0
3699 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3702 ; GFX9-LABEL: test_umin_ule_i64:
3704 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
3705 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3706 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3707 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3708 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3709 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3710 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[0:1]
3711 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
3712 ; GFX9-NEXT: s_cselect_b32 s3, s3, s5
3713 ; GFX9-NEXT: s_cselect_b32 s2, s2, s4
3714 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3715 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3716 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3717 ; GFX9-NEXT: s_endpgm
3719 ; GFX10-LABEL: test_umin_ule_i64:
3721 ; GFX10-NEXT: s_clause 0x1
3722 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3723 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
3724 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3725 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3726 ; GFX10-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5]
3727 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
3728 ; GFX10-NEXT: s_cselect_b32 s2, s2, s4
3729 ; GFX10-NEXT: s_cselect_b32 s3, s3, s5
3730 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
3731 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
3732 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3733 ; GFX10-NEXT: s_endpgm
3735 ; GFX11-LABEL: test_umin_ule_i64:
3737 ; GFX11-NEXT: s_clause 0x1
3738 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
3739 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
3740 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
3741 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3742 ; GFX11-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5]
3743 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3744 ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
3745 ; GFX11-NEXT: s_cselect_b32 s2, s2, s4
3746 ; GFX11-NEXT: s_cselect_b32 s3, s3, s5
3747 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
3748 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
3749 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
3750 ; GFX11-NEXT: s_endpgm
; IR under test: unsigned less-or-equal selects %a, otherwise %b.
3751 %tmp = icmp ule i64 %a, %b
3752 %val = select i1 %tmp, i64 %a, i64 %b
3753 store i64 %val, ptr addrspace(1) %out, align 8
; Scalar signed i64 min written as "icmp slt" + "select" on two i64 kernel
; arguments; the i64 result is stored to %out with align 8.
; The amdgcn checks below expect a 64-bit signed compare (v_cmp_lt_i64)
; followed by one s_cselect_b32 per 32-bit half. The r600 checks mix one
; signed compare (SETGT_INT) with one unsigned compare (SETGT_UINT) on
; different dword pairs -- presumably the high and low halves respectively;
; confirm against the kernel-argument layout.
3757 define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3758 ; EG-LABEL: test_imin_slt_i64:
3760 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
3761 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3764 ; EG-NEXT: ALU clause starting at 4:
3765 ; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3766 ; EG-NEXT: SETGT_INT * T0.W, KC0[3].Z, KC0[3].X,
3767 ; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3768 ; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3769 ; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3770 ; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3771 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3772 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3774 ; CI-LABEL: test_imin_slt_i64:
3776 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3777 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
3778 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3779 ; CI-NEXT: v_mov_b32_e32 v0, s0
3780 ; CI-NEXT: v_mov_b32_e32 v1, s4
3781 ; CI-NEXT: v_mov_b32_e32 v2, s5
3782 ; CI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
3783 ; CI-NEXT: v_mov_b32_e32 v1, s1
3784 ; CI-NEXT: s_and_b64 s[0:1], vcc, exec
3785 ; CI-NEXT: s_cselect_b32 s0, s3, s5
3786 ; CI-NEXT: s_cselect_b32 s1, s2, s4
3787 ; CI-NEXT: v_mov_b32_e32 v2, s1
3788 ; CI-NEXT: v_mov_b32_e32 v3, s0
3789 ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3792 ; VI-LABEL: test_imin_slt_i64:
3794 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3795 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
3796 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3797 ; VI-NEXT: v_mov_b32_e32 v0, s0
3798 ; VI-NEXT: v_mov_b32_e32 v1, s4
3799 ; VI-NEXT: v_mov_b32_e32 v2, s5
3800 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
3801 ; VI-NEXT: v_mov_b32_e32 v1, s1
3802 ; VI-NEXT: s_and_b64 s[0:1], vcc, exec
3803 ; VI-NEXT: s_cselect_b32 s0, s3, s5
3804 ; VI-NEXT: s_cselect_b32 s1, s2, s4
3805 ; VI-NEXT: v_mov_b32_e32 v2, s1
3806 ; VI-NEXT: v_mov_b32_e32 v3, s0
3807 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3810 ; GFX9-LABEL: test_imin_slt_i64:
3812 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
3813 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3814 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3815 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3816 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3817 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3818 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
3819 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
3820 ; GFX9-NEXT: s_cselect_b32 s3, s3, s5
3821 ; GFX9-NEXT: s_cselect_b32 s2, s2, s4
3822 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3823 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3824 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3825 ; GFX9-NEXT: s_endpgm
3827 ; GFX10-LABEL: test_imin_slt_i64:
3829 ; GFX10-NEXT: s_clause 0x1
3830 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3831 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
3832 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3833 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3834 ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5]
3835 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
3836 ; GFX10-NEXT: s_cselect_b32 s2, s2, s4
3837 ; GFX10-NEXT: s_cselect_b32 s3, s3, s5
3838 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
3839 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
3840 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3841 ; GFX10-NEXT: s_endpgm
3843 ; GFX11-LABEL: test_imin_slt_i64:
3845 ; GFX11-NEXT: s_clause 0x1
3846 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
3847 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
3848 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
3849 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3850 ; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5]
3851 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3852 ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
3853 ; GFX11-NEXT: s_cselect_b32 s2, s2, s4
3854 ; GFX11-NEXT: s_cselect_b32 s3, s3, s5
3855 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
3856 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
3857 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
3858 ; GFX11-NEXT: s_endpgm
; IR under test: strict signed less-than selects %a, otherwise %b.
3859 %tmp = icmp slt i64 %a, %b
3860 %val = select i1 %tmp, i64 %a, i64 %b
3861 store i64 %val, ptr addrspace(1) %out, align 8
; Scalar signed i64 min written as "icmp sle" + "select" on two i64 kernel
; arguments; the i64 result is stored to %out with align 8.
; The amdgcn checks below expect a 64-bit signed less-or-equal compare
; (v_cmp_le_i64) followed by one s_cselect_b32 per 32-bit half; the r600
; checks expect a SETE_INT/SETGT_INT/SETGT_UINT/CNDE_INT sequence.
3865 define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3866 ; EG-LABEL: test_imin_sle_i64:
3868 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
3869 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3872 ; EG-NEXT: ALU clause starting at 4:
3873 ; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3874 ; EG-NEXT: SETGT_INT * T0.W, KC0[3].Z, KC0[3].X,
3875 ; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3876 ; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3877 ; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3878 ; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3879 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3880 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3882 ; CI-LABEL: test_imin_sle_i64:
3884 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3885 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
3886 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3887 ; CI-NEXT: v_mov_b32_e32 v0, s0
3888 ; CI-NEXT: v_mov_b32_e32 v1, s4
3889 ; CI-NEXT: v_mov_b32_e32 v2, s5
3890 ; CI-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
3891 ; CI-NEXT: v_mov_b32_e32 v1, s1
3892 ; CI-NEXT: s_and_b64 s[0:1], vcc, exec
3893 ; CI-NEXT: s_cselect_b32 s0, s3, s5
3894 ; CI-NEXT: s_cselect_b32 s1, s2, s4
3895 ; CI-NEXT: v_mov_b32_e32 v2, s1
3896 ; CI-NEXT: v_mov_b32_e32 v3, s0
3897 ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3900 ; VI-LABEL: test_imin_sle_i64:
3902 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3903 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
3904 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3905 ; VI-NEXT: v_mov_b32_e32 v0, s0
3906 ; VI-NEXT: v_mov_b32_e32 v1, s4
3907 ; VI-NEXT: v_mov_b32_e32 v2, s5
3908 ; VI-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
3909 ; VI-NEXT: v_mov_b32_e32 v1, s1
3910 ; VI-NEXT: s_and_b64 s[0:1], vcc, exec
3911 ; VI-NEXT: s_cselect_b32 s0, s3, s5
3912 ; VI-NEXT: s_cselect_b32 s1, s2, s4
3913 ; VI-NEXT: v_mov_b32_e32 v2, s1
3914 ; VI-NEXT: v_mov_b32_e32 v3, s0
3915 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3918 ; GFX9-LABEL: test_imin_sle_i64:
3920 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
3921 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3922 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3923 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3924 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3925 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3926 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[0:1]
3927 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
3928 ; GFX9-NEXT: s_cselect_b32 s3, s3, s5
3929 ; GFX9-NEXT: s_cselect_b32 s2, s2, s4
3930 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3931 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3932 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3933 ; GFX9-NEXT: s_endpgm
3935 ; GFX10-LABEL: test_imin_sle_i64:
3937 ; GFX10-NEXT: s_clause 0x1
3938 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
3939 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
3940 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3941 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3942 ; GFX10-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5]
3943 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
3944 ; GFX10-NEXT: s_cselect_b32 s2, s2, s4
3945 ; GFX10-NEXT: s_cselect_b32 s3, s3, s5
3946 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
3947 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
3948 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3949 ; GFX10-NEXT: s_endpgm
3951 ; GFX11-LABEL: test_imin_sle_i64:
3953 ; GFX11-NEXT: s_clause 0x1
3954 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
3955 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
3956 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
3957 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3958 ; GFX11-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5]
3959 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3960 ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
3961 ; GFX11-NEXT: s_cselect_b32 s2, s2, s4
3962 ; GFX11-NEXT: s_cselect_b32 s3, s3, s5
3963 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
3964 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
3965 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
3966 ; GFX11-NEXT: s_endpgm
; IR under test: signed less-or-equal selects %a, otherwise %b.
3967 %tmp = icmp sle i64 %a, %b
3968 %val = select i1 %tmp, i64 %a, i64 %b
3969 store i64 %val, ptr addrspace(1) %out, align 8
; Per-work-item signed <2 x i16> min: each work item loads one <2 x i16> from
; %a.ptr and one from %b.ptr at index %tid, combines them with "icmp sle" +
; "select", and stores the result to %out at the same index.
; Expected lowering differs by target in the checks below:
;   - kaveri widens each lane to 32 bits (v_bfe_i32 / v_ashrrev_i32), uses
;     v_min_i32 per lane, then repacks with shift/and/or;
;   - tonga uses v_min_i16 for one lane and an SDWA v_min_i16 for the other;
;   - gfx900, gfx1010 and gfx1100 use a single packed v_pk_min_i16;
;   - r600 sign-extends with BFE_INT and uses MIN_INT per lane.
3973 define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
3974 ; EG-LABEL: v_test_imin_sle_v2i16:
3976 ; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
3978 ; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
3979 ; EG-NEXT: TEX 0 @10
3980 ; EG-NEXT: ALU 16, @16, KC0[CB0:0-32], KC1[]
3981 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
3984 ; EG-NEXT: Fetch clause starting at 8:
3985 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
3986 ; EG-NEXT: Fetch clause starting at 10:
3987 ; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
3988 ; EG-NEXT: ALU clause starting at 12:
3989 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
3990 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3991 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
3992 ; EG-NEXT: ALU clause starting at 15:
3993 ; EG-NEXT: ADD_INT * T7.X, KC0[2].W, T0.W,
3994 ; EG-NEXT: ALU clause starting at 16:
3995 ; EG-NEXT: LSHR T1.W, T0.X, literal.x,
3996 ; EG-NEXT: LSHR * T2.W, T7.X, literal.x,
3997 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3998 ; EG-NEXT: BFE_INT T8.X, PS, 0.0, literal.x,
3999 ; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
4000 ; EG-NEXT: BFE_INT T0.Z, T7.X, 0.0, literal.x,
4001 ; EG-NEXT: BFE_INT * T1.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
4002 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4003 ; EG-NEXT: MIN_INT T1.W, PV.W, PV.Z,
4004 ; EG-NEXT: MIN_INT * T2.W, PV.Y, PV.X,
4005 ; EG-NEXT: LSHL T2.W, PS, literal.x,
4006 ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
4007 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
4008 ; EG-NEXT: OR_INT T0.X, PS, PV.W,
4009 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
4010 ; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
4011 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4013 ; CI-LABEL: v_test_imin_sle_v2i16:
4015 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4016 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
4017 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
4018 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4019 ; CI-NEXT: v_mov_b32_e32 v1, s3
4020 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
4021 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4022 ; CI-NEXT: v_mov_b32_e32 v3, s5
4023 ; CI-NEXT: flat_load_dword v4, v[0:1]
4024 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
4025 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
4026 ; CI-NEXT: flat_load_dword v3, v[0:1]
4027 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
4028 ; CI-NEXT: v_mov_b32_e32 v1, s1
4029 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4030 ; CI-NEXT: s_waitcnt vmcnt(1)
4031 ; CI-NEXT: v_bfe_i32 v2, v4, 0, 16
4032 ; CI-NEXT: v_ashrrev_i32_e32 v4, 16, v4
4033 ; CI-NEXT: s_waitcnt vmcnt(0)
4034 ; CI-NEXT: v_bfe_i32 v5, v3, 0, 16
4035 ; CI-NEXT: v_ashrrev_i32_e32 v3, 16, v3
4036 ; CI-NEXT: v_min_i32_e32 v3, v4, v3
4037 ; CI-NEXT: v_min_i32_e32 v2, v2, v5
4038 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
4039 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
4040 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
4041 ; CI-NEXT: flat_store_dword v[0:1], v2
4044 ; VI-LABEL: v_test_imin_sle_v2i16:
4046 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4047 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
4048 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
4049 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4050 ; VI-NEXT: v_mov_b32_e32 v1, s3
4051 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
4052 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4053 ; VI-NEXT: v_mov_b32_e32 v3, s5
4054 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
4055 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
4056 ; VI-NEXT: flat_load_dword v5, v[0:1]
4057 ; VI-NEXT: flat_load_dword v2, v[2:3]
4058 ; VI-NEXT: v_mov_b32_e32 v1, s1
4059 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
4060 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4061 ; VI-NEXT: s_waitcnt vmcnt(0)
4062 ; VI-NEXT: v_min_i16_e32 v3, v5, v2
4063 ; VI-NEXT: v_min_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4064 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
4065 ; VI-NEXT: flat_store_dword v[0:1], v2
4068 ; GFX9-LABEL: v_test_imin_sle_v2i16:
4070 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4071 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
4072 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4073 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4074 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
4075 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
4076 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4077 ; GFX9-NEXT: v_pk_min_i16 v1, v1, v2
4078 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
4079 ; GFX9-NEXT: s_endpgm
4081 ; GFX10-LABEL: v_test_imin_sle_v2i16:
4083 ; GFX10-NEXT: s_clause 0x1
4084 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4085 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
4086 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4087 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4088 ; GFX10-NEXT: s_clause 0x1
4089 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
4090 ; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
4091 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4092 ; GFX10-NEXT: v_pk_min_i16 v1, v1, v2
4093 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
4094 ; GFX10-NEXT: s_endpgm
4096 ; GFX11-LABEL: v_test_imin_sle_v2i16:
4098 ; GFX11-NEXT: s_clause 0x1
4099 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
4100 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
4101 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
4102 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4103 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4104 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4105 ; GFX11-NEXT: s_clause 0x1
4106 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
4107 ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
4108 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4109 ; GFX11-NEXT: v_pk_min_i16 v1, v1, v2
4110 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
4111 ; GFX11-NEXT: s_endpgm
; IR under test: %tid indexes all three buffers, so every work item reads and
; writes its own <2 x i16> element.
4112 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4113 %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
4114 %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
4115 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
4116 %a = load <2 x i16>, ptr addrspace(1) %a.gep
4117 %b = load <2 x i16>, ptr addrspace(1) %b.gep
4118 %cmp = icmp sle <2 x i16> %a, %b
4119 %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
4120 store <2 x i16> %val, ptr addrspace(1) %out.gep
; Per-work-item UNSIGNED <2 x i16> min: each work item loads one <2 x i16>
; from %a.ptr and one from %b.ptr at index %tid, combines them with
; "icmp ule" + "select", and stores the result to %out at the same index.
; NOTE(review): the function name says "imin", but the compare is unsigned and
; every check below expects an unsigned min (MIN_UINT, v_min_u32, v_min_u16,
; v_pk_min_u16). The name is left unchanged because the -LABEL checks depend
; on it; renaming to "umin" would require regenerating the assertions.
; Expected lowering differs by target in the checks below:
;   - kaveri zero-extends each lane (v_lshrrev_b32 / v_and_b32 0xffff), uses
;     v_min_u32 per lane, then repacks with shift/or;
;   - tonga uses v_min_u16 for one lane and an SDWA v_min_u16 for the other;
;   - gfx900, gfx1010 and gfx1100 use a single packed v_pk_min_u16;
;   - r600 masks/shifts each lane and uses MIN_UINT per lane.
4126 define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
4127 ; EG-LABEL: v_test_imin_ule_v2i16:
4129 ; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
4131 ; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
4132 ; EG-NEXT: TEX 0 @10
4133 ; EG-NEXT: ALU 13, @16, KC0[CB0:0-32], KC1[]
4134 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
4137 ; EG-NEXT: Fetch clause starting at 8:
4138 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
4139 ; EG-NEXT: Fetch clause starting at 10:
4140 ; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
4141 ; EG-NEXT: ALU clause starting at 12:
4142 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
4143 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4144 ; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W,
4145 ; EG-NEXT: ALU clause starting at 15:
4146 ; EG-NEXT: ADD_INT * T7.X, KC0[2].Z, T0.W,
4147 ; EG-NEXT: ALU clause starting at 16:
4148 ; EG-NEXT: LSHR T1.W, T0.X, literal.x,
4149 ; EG-NEXT: LSHR * T2.W, T7.X, literal.x,
4150 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4151 ; EG-NEXT: AND_INT T0.Z, T0.X, literal.x,
4152 ; EG-NEXT: AND_INT T3.W, T7.X, literal.x, BS:VEC_120/SCL_212
4153 ; EG-NEXT: MIN_UINT * T1.W, PS, PV.W,
4154 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
4155 ; EG-NEXT: LSHL T1.W, PS, literal.x,
4156 ; EG-NEXT: MIN_UINT * T2.W, PV.W, PV.Z,
4157 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4158 ; EG-NEXT: OR_INT T0.X, PS, PV.W,
4159 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
4160 ; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
4161 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4163 ; CI-LABEL: v_test_imin_ule_v2i16:
4165 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4166 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
4167 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
4168 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4169 ; CI-NEXT: v_mov_b32_e32 v1, s3
4170 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
4171 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4172 ; CI-NEXT: v_mov_b32_e32 v3, s5
4173 ; CI-NEXT: flat_load_dword v4, v[0:1]
4174 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
4175 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
4176 ; CI-NEXT: flat_load_dword v3, v[0:1]
4177 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
4178 ; CI-NEXT: v_mov_b32_e32 v1, s1
4179 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4180 ; CI-NEXT: s_waitcnt vmcnt(1)
4181 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
4182 ; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4
4183 ; CI-NEXT: s_waitcnt vmcnt(0)
4184 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
4185 ; CI-NEXT: v_and_b32_e32 v3, 0xffff, v3
4186 ; CI-NEXT: v_min_u32_e32 v2, v2, v5
4187 ; CI-NEXT: v_min_u32_e32 v3, v4, v3
4188 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
4189 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
4190 ; CI-NEXT: flat_store_dword v[0:1], v2
4193 ; VI-LABEL: v_test_imin_ule_v2i16:
4195 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4196 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
4197 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
4198 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4199 ; VI-NEXT: v_mov_b32_e32 v1, s3
4200 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
4201 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4202 ; VI-NEXT: v_mov_b32_e32 v3, s5
4203 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
4204 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
4205 ; VI-NEXT: flat_load_dword v5, v[0:1]
4206 ; VI-NEXT: flat_load_dword v2, v[2:3]
4207 ; VI-NEXT: v_mov_b32_e32 v1, s1
4208 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
4209 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4210 ; VI-NEXT: s_waitcnt vmcnt(0)
4211 ; VI-NEXT: v_min_u16_e32 v3, v5, v2
4212 ; VI-NEXT: v_min_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4213 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
4214 ; VI-NEXT: flat_store_dword v[0:1], v2
4217 ; GFX9-LABEL: v_test_imin_ule_v2i16:
4219 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4220 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
4221 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4222 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4223 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
4224 ; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
4225 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4226 ; GFX9-NEXT: v_pk_min_u16 v1, v1, v2
4227 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
4228 ; GFX9-NEXT: s_endpgm
4230 ; GFX10-LABEL: v_test_imin_ule_v2i16:
4232 ; GFX10-NEXT: s_clause 0x1
4233 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4234 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
4235 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4236 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4237 ; GFX10-NEXT: s_clause 0x1
4238 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
4239 ; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
4240 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4241 ; GFX10-NEXT: v_pk_min_u16 v1, v1, v2
4242 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
4243 ; GFX10-NEXT: s_endpgm
4245 ; GFX11-LABEL: v_test_imin_ule_v2i16:
4247 ; GFX11-NEXT: s_clause 0x1
4248 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
4249 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
4250 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
4251 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4252 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4253 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4254 ; GFX11-NEXT: s_clause 0x1
4255 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
4256 ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
4257 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4258 ; GFX11-NEXT: v_pk_min_u16 v1, v1, v2
4259 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
4260 ; GFX11-NEXT: s_endpgm
; IR under test: %tid indexes all three buffers, so every work item reads and
; writes its own <2 x i16> element; the compare predicate is unsigned (ule).
4261 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4262 %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
4263 %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
4264 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
4265 %a = load <2 x i16>, ptr addrspace(1) %a.gep
4266 %b = load <2 x i16>, ptr addrspace(1) %b.gep
4267 %cmp = icmp ule <2 x i16> %a, %b
4268 %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
4269 store <2 x i16> %val, ptr addrspace(1) %out.gep
; Intrinsic called by the v_test_* kernels to obtain the i32 %tid they use as
; the element index into their input and output buffers.
4273 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute groups: #0 is attached to the kernel definitions in this file,
; #1 to the intrinsic declaration above.
4275 attributes #0 = { nounwind }
4276 attributes #1 = { nounwind readnone }