1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
7 declare i32 @llvm.amdgcn.workitem.id.x() #1
8 declare half @llvm.fabs.f16(half)
9 declare float @llvm.fabs.f32(float)
10 declare double @llvm.fabs.f64(double)
12 ; All nan values are converted to 0xffffffff
; Kernel under test: each workitem loads a float from %fptr indexed by its
; workitem id x, then stores to %out either the all-ones NaN constant (when
; %c != 0) or the loaded value (when %c == 0). Unlike @v_cnd_nan, the
; non-constant select operand comes from a memory load rather than directly
; from a kernel argument.
13 define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
14 ; SI-LABEL: v_cnd_nan_nosgpr:
16 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
17 ; SI-NEXT: s_load_dword s8, s[4:5], 0xb
18 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
19 ; SI-NEXT: s_mov_b32 s3, 0xf000
20 ; SI-NEXT: s_mov_b32 s6, 0
21 ; SI-NEXT: s_mov_b32 s7, s3
22 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
23 ; SI-NEXT: v_mov_b32_e32 v1, 0
24 ; SI-NEXT: s_waitcnt lgkmcnt(0)
25 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
26 ; SI-NEXT: s_mov_b32 s2, -1
27 ; SI-NEXT: s_cmp_eq_u32 s8, 0
28 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
29 ; SI-NEXT: s_waitcnt vmcnt(0)
30 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
34 ; VI-LABEL: v_cnd_nan_nosgpr:
36 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
37 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
38 ; VI-NEXT: s_waitcnt lgkmcnt(0)
39 ; VI-NEXT: v_mov_b32_e32 v1, s1
40 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
41 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
42 ; VI-NEXT: flat_load_dword v0, v[0:1]
43 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
44 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
45 ; VI-NEXT: s_waitcnt lgkmcnt(0)
46 ; VI-NEXT: s_cmp_eq_u32 s2, 0
47 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
48 ; VI-NEXT: s_waitcnt vmcnt(0)
49 ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
50 ; VI-NEXT: v_mov_b32_e32 v0, s0
51 ; VI-NEXT: v_mov_b32_e32 v1, s1
52 ; VI-NEXT: flat_store_dword v[0:1], v2
55 ; GFX10-LABEL: v_cnd_nan_nosgpr:
57 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
58 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
59 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
60 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
61 ; GFX10-NEXT: global_load_dword v0, v0, s[0:1]
62 ; GFX10-NEXT: s_clause 0x1
63 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
64 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
65 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
66 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
67 ; GFX10-NEXT: s_cmp_eq_u32 s2, 0
68 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0
69 ; GFX10-NEXT: s_waitcnt vmcnt(0)
70 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
71 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
72 ; GFX10-NEXT: s_endpgm
74 ; GFX11-LABEL: v_cnd_nan_nosgpr:
76 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
77 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
78 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
79 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
80 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
81 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
82 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
83 ; GFX11-NEXT: s_clause 0x1
84 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
85 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
86 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
87 ; GFX11-NEXT: s_cmp_eq_u32 s2, 0
88 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
89 ; GFX11-NEXT: s_waitcnt vmcnt(0)
90 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
91 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
92 ; GFX11-NEXT: s_endpgm
; Per-workitem element of %fptr, indexed by the workitem id in x.
93 %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
94 %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
95 %f = load float, ptr addrspace(1) %f.gep
96 %setcc = icmp ne i32 %c, 0
; float 0xFFFFFFFFE0000000 is the 64-bit hex spelling of the float whose 32
; bits are all ones; the expected output materializes it as the inline
; constant -1 in v_cndmask_b32.
97 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
98 store float %select, ptr addrspace(1) %out
102 ; This requires slightly trickier SGPR operand legalization since the
103 ; single constant bus SGPR usage is the last operand, and it should never be moved.
105 ; However on GFX10 constant bus is limited to 2 scalar operands, not one.
106 ; All nan values are converted to 0xffffffff
; Kernel under test: stores to %out the all-ones NaN constant when %c != 0,
; otherwise the kernel argument %f. In the expected output below, the default
; and tonga runs first copy %f from an SGPR into a VGPR (v_mov_b32) before
; v_cndmask_b32_e32, while the gfx1010 and gfx1100 runs feed the SGPR straight
; into v_cndmask_b32_e64 together with an SGPR-pair condition.
107 define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 {
108 ; SI-LABEL: v_cnd_nan:
110 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
111 ; SI-NEXT: s_mov_b32 s7, 0xf000
112 ; SI-NEXT: s_mov_b32 s6, -1
113 ; SI-NEXT: s_waitcnt lgkmcnt(0)
114 ; SI-NEXT: s_mov_b32 s4, s0
115 ; SI-NEXT: s_mov_b32 s5, s1
116 ; SI-NEXT: s_cmp_eq_u32 s2, 0
117 ; SI-NEXT: v_mov_b32_e32 v0, s3
118 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
119 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
120 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
123 ; VI-LABEL: v_cnd_nan:
125 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
126 ; VI-NEXT: s_waitcnt lgkmcnt(0)
127 ; VI-NEXT: s_cmp_eq_u32 s2, 0
128 ; VI-NEXT: v_mov_b32_e32 v0, s3
129 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
130 ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
131 ; VI-NEXT: v_mov_b32_e32 v0, s0
132 ; VI-NEXT: v_mov_b32_e32 v1, s1
133 ; VI-NEXT: flat_store_dword v[0:1], v2
136 ; GFX10-LABEL: v_cnd_nan:
138 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
139 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
140 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
141 ; GFX10-NEXT: s_cmp_eq_u32 s2, 0
142 ; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0
143 ; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5]
144 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
145 ; GFX10-NEXT: s_endpgm
147 ; GFX11-LABEL: v_cnd_nan:
149 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
150 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
151 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
152 ; GFX11-NEXT: s_cmp_eq_u32 s2, 0
153 ; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0
154 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
155 ; GFX11-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5]
156 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
157 ; GFX11-NEXT: s_endpgm
158 %setcc = icmp ne i32 %c, 0
; float 0xFFFFFFFFE0000000 is the 64-bit hex spelling of the float whose 32
; bits are all ones (shown as -1 in the expected v_cndmask_b32 operands).
159 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
160 store float %select, ptr addrspace(1) %out
164 ; Test different compare and select operand types for optimal code shrinking.
166 ; (select (cmp (sgprX, constant)), constant, sgprZ)
; Kernel under test: compares kernel argument %x against 0.0 (fcmp one) and
; stores 1.0 when the compare is true, otherwise kernel argument %z, to the
; element of %out indexed by the workitem id in x. The [8 x i32] parameter
; only pads the argument list between %out and %x. The expected output uses
; the inverted compare v_cmp_nlg_f32 with the constant as the first source of
; v_cndmask_b32.
167 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
168 ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
170 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
171 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13
172 ; SI-NEXT: s_mov_b32 s3, 0xf000
173 ; SI-NEXT: s_mov_b32 s2, 0
174 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
175 ; SI-NEXT: v_mov_b32_e32 v1, 0
176 ; SI-NEXT: s_waitcnt lgkmcnt(0)
177 ; SI-NEXT: v_mov_b32_e32 v2, s5
178 ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
179 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
180 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
183 ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
185 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
186 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c
187 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
188 ; VI-NEXT: s_waitcnt lgkmcnt(0)
189 ; VI-NEXT: v_mov_b32_e32 v1, s1
190 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
191 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
192 ; VI-NEXT: v_mov_b32_e32 v2, s3
193 ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
194 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
195 ; VI-NEXT: flat_store_dword v[0:1], v2
198 ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
200 ; GFX10-NEXT: s_clause 0x1
201 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
202 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
203 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
204 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
205 ; GFX10-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0
206 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5]
207 ; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
208 ; GFX10-NEXT: s_endpgm
210 ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
212 ; GFX11-NEXT: s_clause 0x1
213 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
214 ; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
215 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
216 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
217 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
218 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
219 ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0
220 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5]
221 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
222 ; GFX11-NEXT: s_endpgm
; Output slot for this workitem: %out indexed by the sign-extended id.
223 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
224 %tid.ext = sext i32 %tid to i64
225 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Compare constant is 0.0, select constant is 1.0, fallback is argument %z.
226 %setcc = fcmp one float %x, 0.0
227 %select = select i1 %setcc, float 1.0, float %z
228 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: compares kernel argument %x against 0.0 (fcmp one) and
; stores 1.0 when the compare is true, otherwise %x itself, to the element of
; %out indexed by the workitem id in x. The same argument is therefore both the
; compare operand and the select fallback; in the gfx1010/gfx1100 expected
; output a single SGPR (s6) is used for both.
232 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
233 ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
235 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
236 ; SI-NEXT: s_load_dword s4, s[4:5], 0xb
237 ; SI-NEXT: s_mov_b32 s3, 0xf000
238 ; SI-NEXT: s_mov_b32 s2, 0
239 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
240 ; SI-NEXT: v_mov_b32_e32 v1, 0
241 ; SI-NEXT: s_waitcnt lgkmcnt(0)
242 ; SI-NEXT: v_mov_b32_e32 v2, s4
243 ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
244 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
245 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
248 ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
250 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
251 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
252 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
253 ; VI-NEXT: s_waitcnt lgkmcnt(0)
254 ; VI-NEXT: v_mov_b32_e32 v1, s1
255 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
256 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
257 ; VI-NEXT: v_mov_b32_e32 v2, s2
258 ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
259 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
260 ; VI-NEXT: flat_store_dword v[0:1], v2
263 ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
265 ; GFX10-NEXT: s_clause 0x1
266 ; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c
267 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
268 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
269 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
270 ; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0
271 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s6, s[2:3]
272 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
273 ; GFX10-NEXT: s_endpgm
275 ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
277 ; GFX11-NEXT: s_clause 0x1
278 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
279 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
280 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
281 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
282 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
283 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
284 ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0
285 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s6, s[2:3]
286 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
287 ; GFX11-NEXT: s_endpgm
; Output slot for this workitem: %out indexed by the sign-extended id.
288 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
289 %tid.ext = sext i32 %tid to i64
290 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Compare constant is 0.0, select constant is 1.0, fallback is %x again.
291 %setcc = fcmp one float %x, 0.0
292 %select = select i1 %setcc, float 1.0, float %x
293 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: compares kernel argument %x against 0.0 (fcmp one) and
; stores 0.0 when the compare is true, otherwise kernel argument %z, to the
; element of %out indexed by the workitem id in x. Here the compare constant
; and the select constant are the same value (0.0). The [8 x i32] parameter
; only pads the argument list between %out and %x.
297 define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
298 ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
300 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
301 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13
302 ; SI-NEXT: s_mov_b32 s3, 0xf000
303 ; SI-NEXT: s_mov_b32 s2, 0
304 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
305 ; SI-NEXT: v_mov_b32_e32 v1, 0
306 ; SI-NEXT: s_waitcnt lgkmcnt(0)
307 ; SI-NEXT: v_mov_b32_e32 v2, s5
308 ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
309 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
310 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
313 ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
315 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
316 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c
317 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
318 ; VI-NEXT: s_waitcnt lgkmcnt(0)
319 ; VI-NEXT: v_mov_b32_e32 v1, s1
320 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
321 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
322 ; VI-NEXT: v_mov_b32_e32 v2, s3
323 ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
324 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
325 ; VI-NEXT: flat_store_dword v[0:1], v2
328 ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
330 ; GFX10-NEXT: s_clause 0x1
331 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
332 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
333 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
334 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
335 ; GFX10-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0
336 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5]
337 ; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
338 ; GFX10-NEXT: s_endpgm
340 ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
342 ; GFX11-NEXT: s_clause 0x1
343 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
344 ; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
345 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
346 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
347 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
348 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
349 ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0
350 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5]
351 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
352 ; GFX11-NEXT: s_endpgm
; Output slot for this workitem: %out indexed by the sign-extended id.
353 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
354 %tid.ext = sext i32 %tid to i64
355 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Compare constant and select constant are both 0.0; fallback is argument %z.
356 %setcc = fcmp one float %x, 0.0
357 %select = select i1 %setcc, float 0.0, float %z
358 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: compares kernel argument %x against 0.0 (fcmp one) and
; stores 0.0 when the compare is true, otherwise %x itself, to the element of
; %out indexed by the workitem id in x. The compare constant equals the select
; constant, and the compare operand equals the select fallback.
362 define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
363 ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
365 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
366 ; SI-NEXT: s_load_dword s4, s[4:5], 0xb
367 ; SI-NEXT: s_mov_b32 s3, 0xf000
368 ; SI-NEXT: s_mov_b32 s2, 0
369 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
370 ; SI-NEXT: v_mov_b32_e32 v1, 0
371 ; SI-NEXT: s_waitcnt lgkmcnt(0)
372 ; SI-NEXT: v_mov_b32_e32 v2, s4
373 ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
374 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
375 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
378 ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
380 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
381 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
382 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
383 ; VI-NEXT: s_waitcnt lgkmcnt(0)
384 ; VI-NEXT: v_mov_b32_e32 v1, s1
385 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
386 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
387 ; VI-NEXT: v_mov_b32_e32 v2, s2
388 ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
389 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
390 ; VI-NEXT: flat_store_dword v[0:1], v2
393 ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
395 ; GFX10-NEXT: s_clause 0x1
396 ; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c
397 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
398 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
399 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
400 ; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0
401 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s6, s[2:3]
402 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
403 ; GFX10-NEXT: s_endpgm
405 ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
407 ; GFX11-NEXT: s_clause 0x1
408 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
409 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
410 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
411 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
412 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
413 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
414 ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0
415 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s6, s[2:3]
416 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
417 ; GFX11-NEXT: s_endpgm
; Output slot for this workitem: %out indexed by the sign-extended id.
418 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
419 %tid.ext = sext i32 %tid to i64
420 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Compare constant and select constant are both 0.0; fallback is %x again.
421 %setcc = fcmp one float %x, 0.0
422 %select = select i1 %setcc, float 0.0, float %x
423 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: compares kernel argument %x against 0.0 (fcmp one) and
; stores 0.0 when the compare is true, otherwise a float %z loaded from
; %z.ptr, to %out. Both %z.ptr and %out are indexed by the workitem id in x.
; The select fallback is a per-workitem loaded value, and every run in the
; expected output uses the VOP2 form v_cndmask_b32_e32 with vcc.
427 define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
428 ; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
430 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
431 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
432 ; SI-NEXT: s_mov_b32 s3, 0xf000
433 ; SI-NEXT: s_mov_b32 s2, 0
434 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
435 ; SI-NEXT: v_mov_b32_e32 v1, 0
436 ; SI-NEXT: s_waitcnt lgkmcnt(0)
437 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
438 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
439 ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s6, 0
440 ; SI-NEXT: s_waitcnt vmcnt(0)
441 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
442 ; SI-NEXT: s_waitcnt lgkmcnt(0)
443 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
446 ; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
448 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
449 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
450 ; VI-NEXT: s_waitcnt lgkmcnt(0)
451 ; VI-NEXT: v_mov_b32_e32 v1, s1
452 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
453 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
454 ; VI-NEXT: flat_load_dword v3, v[0:1]
455 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
456 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
457 ; VI-NEXT: s_waitcnt lgkmcnt(0)
458 ; VI-NEXT: v_mov_b32_e32 v1, s1
459 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
460 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
461 ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
462 ; VI-NEXT: s_waitcnt vmcnt(0)
463 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
464 ; VI-NEXT: flat_store_dword v[0:1], v2
467 ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
469 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
470 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
471 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
472 ; GFX10-NEXT: global_load_dword v1, v0, s[0:1]
473 ; GFX10-NEXT: s_clause 0x1
474 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
475 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
476 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
477 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
478 ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
479 ; GFX10-NEXT: s_waitcnt vmcnt(0)
480 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
481 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
482 ; GFX10-NEXT: s_endpgm
484 ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
486 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
487 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
488 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
489 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
490 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
491 ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
492 ; GFX11-NEXT: s_clause 0x1
493 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
494 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
495 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
496 ; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
497 ; GFX11-NEXT: s_waitcnt vmcnt(0)
498 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
499 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
500 ; GFX11-NEXT: s_endpgm
; Input and output slots for this workitem share the same sign-extended index.
501 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
502 %tid.ext = sext i32 %tid to i64
503 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
504 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
505 %z = load float, ptr addrspace(1) %z.gep
; Compare constant and select constant are both 0.0; fallback is the loaded %z.
506 %setcc = fcmp one float %x, 0.0
507 %select = select i1 %setcc, float 0.0, float %z
508 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: compares kernel argument %x against 0.0 (fcmp one) and
; stores 1.0 when the compare is true, otherwise a float %z loaded from
; %z.ptr, to %out. Both %z.ptr and %out are indexed by the workitem id in x.
; Same shape as @fcmp_sgprX_k0_select_k0_vgprZ_f32 except that the select
; constant (1.0) differs from the compare constant (0.0).
512 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
513 ; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
515 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
516 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
517 ; SI-NEXT: s_mov_b32 s3, 0xf000
518 ; SI-NEXT: s_mov_b32 s2, 0
519 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
520 ; SI-NEXT: v_mov_b32_e32 v1, 0
521 ; SI-NEXT: s_waitcnt lgkmcnt(0)
522 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
523 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
524 ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s6, 0
525 ; SI-NEXT: s_waitcnt vmcnt(0)
526 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
527 ; SI-NEXT: s_waitcnt lgkmcnt(0)
528 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
531 ; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
533 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
534 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
535 ; VI-NEXT: s_waitcnt lgkmcnt(0)
536 ; VI-NEXT: v_mov_b32_e32 v1, s1
537 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
538 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
539 ; VI-NEXT: flat_load_dword v3, v[0:1]
540 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
541 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
542 ; VI-NEXT: s_waitcnt lgkmcnt(0)
543 ; VI-NEXT: v_mov_b32_e32 v1, s1
544 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
545 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
546 ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
547 ; VI-NEXT: s_waitcnt vmcnt(0)
548 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
549 ; VI-NEXT: flat_store_dword v[0:1], v2
552 ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
554 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
555 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
556 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
557 ; GFX10-NEXT: global_load_dword v1, v0, s[0:1]
558 ; GFX10-NEXT: s_clause 0x1
559 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
560 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
561 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
562 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
563 ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
564 ; GFX10-NEXT: s_waitcnt vmcnt(0)
565 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
566 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
567 ; GFX10-NEXT: s_endpgm
569 ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
571 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
572 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
573 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
574 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
575 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
576 ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
577 ; GFX11-NEXT: s_clause 0x1
578 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
579 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
580 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
581 ; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
582 ; GFX11-NEXT: s_waitcnt vmcnt(0)
583 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
584 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
585 ; GFX11-NEXT: s_endpgm
; Input and output slots for this workitem share the same sign-extended index.
586 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
587 %tid.ext = sext i32 %tid to i64
588 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
589 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
590 %z = load float, ptr addrspace(1) %z.gep
; Compare constant is 0.0, select constant is 1.0, fallback is the loaded %z.
591 %setcc = fcmp one float %x, 0.0
592 %select = select i1 %setcc, float 1.0, float %z
593 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: loads a float %x from %x.ptr, compares it against 0.0
; (fcmp olt), and stores 1.0 when the compare is true, otherwise kernel
; argument %z, to %out. Both %x.ptr and %out are indexed by the workitem id in
; x. The expected output uses v_cmp_ngt_f32 with 0 as the first operand; the
; default and tonga runs copy %z into a VGPR first, while the gfx1010/gfx1100
; runs pass the SGPR to v_cndmask_b32_e64.
597 define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 {
598 ; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
600 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
601 ; SI-NEXT: s_load_dword s8, s[4:5], 0xd
602 ; SI-NEXT: s_mov_b32 s7, 0xf000
603 ; SI-NEXT: s_mov_b32 s6, 0
604 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
605 ; SI-NEXT: v_mov_b32_e32 v1, 0
606 ; SI-NEXT: s_waitcnt lgkmcnt(0)
607 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
608 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
609 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
610 ; SI-NEXT: v_mov_b32_e32 v3, s8
611 ; SI-NEXT: s_waitcnt vmcnt(0)
612 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v2
613 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
614 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
617 ; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
619 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
620 ; VI-NEXT: s_load_dword s4, s[4:5], 0x34
621 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
622 ; VI-NEXT: s_waitcnt lgkmcnt(0)
623 ; VI-NEXT: v_mov_b32_e32 v1, s3
624 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
625 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
626 ; VI-NEXT: flat_load_dword v3, v[0:1]
627 ; VI-NEXT: v_mov_b32_e32 v1, s1
628 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
629 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
630 ; VI-NEXT: v_mov_b32_e32 v4, s4
631 ; VI-NEXT: s_waitcnt vmcnt(0)
632 ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v3
633 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc
634 ; VI-NEXT: flat_store_dword v[0:1], v2
637 ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
639 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
640 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
641 ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34
642 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
643 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
644 ; GFX10-NEXT: s_waitcnt vmcnt(0)
645 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
646 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc
647 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
648 ; GFX10-NEXT: s_endpgm
650 ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
652 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
653 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
654 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
655 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
656 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
657 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
658 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
659 ; GFX11-NEXT: s_waitcnt vmcnt(0)
660 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
661 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc
662 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
663 ; GFX11-NEXT: s_endpgm
; Input and output slots for this workitem share the same sign-extended index.
664 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
665 %tid.ext = sext i32 %tid to i64
666 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
667 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
668 %x = load float, ptr addrspace(1) %x.gep
; Ordered less-than against 0.0; select constant is 1.0, fallback is argument %z.
669 %setcc = fcmp olt float %x, 0.0
670 %select = select i1 %setcc, float 1.0, float %z
671 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: loads floats %x and %z (both volatile loads) from %x.ptr
; and %z.ptr, compares %x against 0.0 (fcmp ult), and stores 1.0 when the
; compare is true, otherwise %z, to %out. All three pointers are indexed by
; the workitem id in x. The expected output uses v_cmp_le_f32 with 0 as the
; first operand, and each volatile load is followed by its own
; s_waitcnt vmcnt(0).
675 define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
676 ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
678 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
679 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
680 ; SI-NEXT: s_mov_b32 s11, 0xf000
681 ; SI-NEXT: s_mov_b32 s10, 0
682 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
683 ; SI-NEXT: v_mov_b32_e32 v1, 0
684 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
685 ; SI-NEXT: s_waitcnt lgkmcnt(0)
686 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
687 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
688 ; SI-NEXT: s_waitcnt vmcnt(0)
689 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
690 ; SI-NEXT: s_waitcnt vmcnt(0)
691 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
692 ; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2
693 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
694 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
697 ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
699 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
700 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
701 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
702 ; VI-NEXT: s_waitcnt lgkmcnt(0)
703 ; VI-NEXT: v_mov_b32_e32 v1, s3
704 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
705 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
706 ; VI-NEXT: v_mov_b32_e32 v3, s5
707 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
708 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
709 ; VI-NEXT: flat_load_dword v5, v[0:1] glc
710 ; VI-NEXT: s_waitcnt vmcnt(0)
711 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
712 ; VI-NEXT: s_waitcnt vmcnt(0)
713 ; VI-NEXT: v_mov_b32_e32 v1, s1
714 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
715 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
716 ; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v5
717 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
718 ; VI-NEXT: flat_store_dword v[0:1], v2
721 ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
723 ; GFX10-NEXT: s_clause 0x1
724 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
725 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
726 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
727 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
728 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
729 ; GFX10-NEXT: s_waitcnt vmcnt(0)
730 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
731 ; GFX10-NEXT: s_waitcnt vmcnt(0)
732 ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
733 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
734 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
735 ; GFX10-NEXT: s_endpgm
737 ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
739 ; GFX11-NEXT: s_clause 0x1
740 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
741 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
742 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
743 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
744 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
745 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
746 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
747 ; GFX11-NEXT: s_waitcnt vmcnt(0)
748 ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
749 ; GFX11-NEXT: s_waitcnt vmcnt(0)
750 ; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
751 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
752 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
753 ; GFX11-NEXT: s_endpgm
; All three buffers are indexed by the same sign-extended workitem id.
754 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
755 %tid.ext = sext i32 %tid to i64
756 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
757 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
758 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads: the expected output keeps them as two separate loads in
; program order (%x first, then %z).
759 %x = load volatile float, ptr addrspace(1) %x.gep
760 %z = load volatile float, ptr addrspace(1) %z.gep
; Unordered-or-less-than against 0.0; select constant is 1.0.
761 %setcc = fcmp ult float %x, 0.0
762 %select = select i1 %setcc, float 1.0, float %z
763 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: integer counterpart of the fcmp tests. Loads i32 values
; %x and %z (both volatile loads) from %x.ptr and %z.ptr, compares %x against 0
; (icmp slt), and stores 2 when the compare is true, otherwise %z, to %out.
; All three pointers are indexed by the workitem id in x. The expected output
; uses v_cmp_lt_i32 with -1 as the first operand and the constant 2 as the
; first source of v_cndmask_b32.
767 define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
768 ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
770 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
771 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
772 ; SI-NEXT: s_mov_b32 s11, 0xf000
773 ; SI-NEXT: s_mov_b32 s10, 0
774 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
775 ; SI-NEXT: v_mov_b32_e32 v1, 0
776 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
777 ; SI-NEXT: s_waitcnt lgkmcnt(0)
778 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
779 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
780 ; SI-NEXT: s_waitcnt vmcnt(0)
781 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
782 ; SI-NEXT: s_waitcnt vmcnt(0)
783 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
784 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
785 ; SI-NEXT: v_cndmask_b32_e32 v2, 2, v3, vcc
786 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
789 ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
791 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
792 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
793 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
794 ; VI-NEXT: s_waitcnt lgkmcnt(0)
795 ; VI-NEXT: v_mov_b32_e32 v1, s3
796 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
797 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
798 ; VI-NEXT: v_mov_b32_e32 v3, s5
799 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
800 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
801 ; VI-NEXT: flat_load_dword v5, v[0:1] glc
802 ; VI-NEXT: s_waitcnt vmcnt(0)
803 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
804 ; VI-NEXT: s_waitcnt vmcnt(0)
805 ; VI-NEXT: v_mov_b32_e32 v1, s1
806 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
807 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
808 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
809 ; VI-NEXT: v_cndmask_b32_e32 v2, 2, v2, vcc
810 ; VI-NEXT: flat_store_dword v[0:1], v2
813 ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
815 ; GFX10-NEXT: s_clause 0x1
816 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
817 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
818 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
819 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
820 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
821 ; GFX10-NEXT: s_waitcnt vmcnt(0)
822 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
823 ; GFX10-NEXT: s_waitcnt vmcnt(0)
824 ; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
825 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc
826 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
827 ; GFX10-NEXT: s_endpgm
829 ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
831 ; GFX11-NEXT: s_clause 0x1
832 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
833 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
834 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
835 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
836 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
837 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
838 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
839 ; GFX11-NEXT: s_waitcnt vmcnt(0)
840 ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
841 ; GFX11-NEXT: s_waitcnt vmcnt(0)
842 ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
843 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc
844 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
845 ; GFX11-NEXT: s_endpgm
; All three buffers are indexed by the same sign-extended workitem id.
846 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
847 %tid.ext = sext i32 %tid to i64
848 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
849 %z.gep = getelementptr inbounds i32, ptr addrspace(1) %z.ptr, i64 %tid.ext
850 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads: the expected output keeps them as two separate loads in
; program order (%x first, then %z).
851 %x = load volatile i32, ptr addrspace(1) %x.gep
852 %z = load volatile i32, ptr addrspace(1) %z.gep
; Signed less-than against 0; select constant is 2.
853 %setcc = icmp slt i32 %x, 0
854 %select = select i1 %setcc, i32 2, i32 %z
855 store i32 %select, ptr addrspace(1) %out.gep
; 64-bit variant of the "x < 0 ? 2 : z" select, with x, z and the stored result
; all i64 elements indexed by workitem.id.x (volatile loads).
; The expected output for every target has one v_cmp_lt_i64 vcc, -1, x followed
; by two v_cndmask_b32 that share vcc: constant 0 for the high dword and
; constant 2 for the low dword, each as the first source operand.
859 define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
860 ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
862 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
863 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
864 ; SI-NEXT: s_mov_b32 s11, 0xf000
865 ; SI-NEXT: s_mov_b32 s10, 0
866 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
867 ; SI-NEXT: v_mov_b32_e32 v1, 0
868 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
869 ; SI-NEXT: s_waitcnt lgkmcnt(0)
870 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
871 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 glc
872 ; SI-NEXT: s_waitcnt vmcnt(0)
873 ; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 glc
874 ; SI-NEXT: s_waitcnt vmcnt(0)
875 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
876 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3]
877 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
878 ; SI-NEXT: v_cndmask_b32_e32 v2, 2, v4, vcc
879 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
882 ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
884 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
885 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
886 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
887 ; VI-NEXT: s_waitcnt lgkmcnt(0)
888 ; VI-NEXT: v_mov_b32_e32 v1, s3
889 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
890 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
891 ; VI-NEXT: v_mov_b32_e32 v3, s5
892 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
893 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
894 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
895 ; VI-NEXT: s_waitcnt vmcnt(0)
896 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
897 ; VI-NEXT: s_waitcnt vmcnt(0)
898 ; VI-NEXT: v_mov_b32_e32 v5, s1
899 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
900 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
901 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
902 ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
903 ; VI-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
904 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
907 ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
909 ; GFX10-NEXT: s_clause 0x1
910 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
911 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
912 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
913 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
914 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
915 ; GFX10-NEXT: s_waitcnt vmcnt(0)
916 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] glc dlc
917 ; GFX10-NEXT: s_waitcnt vmcnt(0)
918 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
919 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
920 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
921 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
922 ; GFX10-NEXT: s_endpgm
924 ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
926 ; GFX11-NEXT: s_clause 0x1
927 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
928 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
929 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
930 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
931 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
932 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
933 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] glc dlc
934 ; GFX11-NEXT: s_waitcnt vmcnt(0)
935 ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] glc dlc
936 ; GFX11-NEXT: s_waitcnt vmcnt(0)
937 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
938 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
939 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
940 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
941 ; GFX11-NEXT: s_endpgm
942 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
943 %tid.ext = sext i32 %tid to i64
944 %x.gep = getelementptr inbounds i64, ptr addrspace(1) %x.ptr, i64 %tid.ext
945 %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
946 %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
947 %x = load volatile i64, ptr addrspace(1) %x.gep
948 %z = load volatile i64, ptr addrspace(1) %z.gep
; The i64 constant 2 is the true operand of the select; the loaded z is the false operand.
949 %setcc = icmp slt i64 %x, 0
950 %select = select i1 %setcc, i64 2, i64 %z
951 store i64 %select, ptr addrspace(1) %out.gep
; A scalar float compare "x ugt 4.0" drives a <4 x float> select whose true
; operand is the loaded vector z and whose false operand is the constant vector
; <1.0, 2.0, -0.5, 4.0>.
; x is indexed as a float element and z / %out as <4 x float> elements, all by
; workitem.id.x; both loads are volatile.
; The expected output for every target has one v_cmp_nge_f32 vcc, 4.0, x and
; four v_cndmask_b32 sharing vcc, each with its constant lane as the first
; source operand.
955 define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
956 ; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
958 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
959 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
960 ; SI-NEXT: s_mov_b32 s11, 0xf000
961 ; SI-NEXT: s_mov_b32 s10, 0
962 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
963 ; SI-NEXT: v_mov_b32_e32 v2, 0
964 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
965 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
966 ; SI-NEXT: v_mov_b32_e32 v5, v2
967 ; SI-NEXT: s_waitcnt lgkmcnt(0)
968 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
969 ; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
970 ; SI-NEXT: s_waitcnt vmcnt(0)
971 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc
972 ; SI-NEXT: s_waitcnt vmcnt(0)
973 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
974 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6
975 ; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
976 ; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
977 ; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
978 ; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
979 ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
982 ; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
984 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
985 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
986 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
987 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
988 ; VI-NEXT: s_waitcnt lgkmcnt(0)
989 ; VI-NEXT: v_mov_b32_e32 v2, s3
990 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
991 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
992 ; VI-NEXT: v_mov_b32_e32 v0, s5
993 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5
994 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
995 ; VI-NEXT: flat_load_dword v6, v[1:2] glc
996 ; VI-NEXT: s_waitcnt vmcnt(0)
997 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc
998 ; VI-NEXT: s_waitcnt vmcnt(0)
999 ; VI-NEXT: v_mov_b32_e32 v7, s1
1000 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5
1001 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
1002 ; VI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6
1003 ; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1004 ; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1005 ; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1006 ; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1007 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1010 ; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
1012 ; GFX10-NEXT: s_clause 0x1
1013 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1014 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1015 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1016 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0
1017 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1018 ; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc
1019 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1020 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc
1021 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1022 ; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6
1023 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1024 ; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1025 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1026 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1027 ; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
1028 ; GFX10-NEXT: s_endpgm
1030 ; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
1032 ; GFX11-NEXT: s_clause 0x1
1033 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1034 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1035 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1036 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1037 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1038 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1039 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1040 ; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] glc dlc
1041 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1042 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc
1043 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1044 ; GFX11-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5
1045 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1046 ; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1047 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1048 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1049 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
1050 ; GFX11-NEXT: s_endpgm
1051 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1052 %tid.ext = sext i32 %tid to i64
1053 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1054 %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
1055 %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
1056 %x = load volatile float, ptr addrspace(1) %x.gep
1057 %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
; A single i1 condition selects between whole vectors; here the constant vector is the false operand.
1058 %setcc = fcmp ugt float %x, 4.0
1059 %select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>
1060 store <4 x float> %select, ptr addrspace(1) %out.gep
; Same "x ugt 4.0" scalar compare driving a <4 x float> select, but with the
; operands the other way round: the constant vector <1.0, 2.0, -0.5, 4.0> is
; the true operand and the loaded vector z is the false operand.
; x is indexed as a float element and z / %out as <4 x float> elements, all by
; workitem.id.x; both loads are volatile.
; The expected output for every target has one v_cmp_ge_f32 vcc, 4.0, x and
; four v_cndmask_b32 sharing vcc, each with its constant lane as the first
; source operand.
1064 define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1065 ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
1067 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1068 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1069 ; SI-NEXT: s_mov_b32 s11, 0xf000
1070 ; SI-NEXT: s_mov_b32 s10, 0
1071 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1072 ; SI-NEXT: v_mov_b32_e32 v2, 0
1073 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1074 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
1075 ; SI-NEXT: v_mov_b32_e32 v5, v2
1076 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1077 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1078 ; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
1079 ; SI-NEXT: s_waitcnt vmcnt(0)
1080 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc
1081 ; SI-NEXT: s_waitcnt vmcnt(0)
1082 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1083 ; SI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6
1084 ; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1085 ; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1086 ; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1087 ; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1088 ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
1091 ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
1093 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1094 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1095 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1096 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
1097 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1098 ; VI-NEXT: v_mov_b32_e32 v2, s3
1099 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
1100 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1101 ; VI-NEXT: v_mov_b32_e32 v0, s5
1102 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5
1103 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
1104 ; VI-NEXT: flat_load_dword v6, v[1:2] glc
1105 ; VI-NEXT: s_waitcnt vmcnt(0)
1106 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc
1107 ; VI-NEXT: s_waitcnt vmcnt(0)
1108 ; VI-NEXT: v_mov_b32_e32 v7, s1
1109 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5
1110 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
1111 ; VI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6
1112 ; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1113 ; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1114 ; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1115 ; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1116 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1119 ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
1121 ; GFX10-NEXT: s_clause 0x1
1122 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1123 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1124 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1125 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0
1126 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1127 ; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc
1128 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1129 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc
1130 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1131 ; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6
1132 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1133 ; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1134 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1135 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1136 ; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
1137 ; GFX10-NEXT: s_endpgm
1139 ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
1141 ; GFX11-NEXT: s_clause 0x1
1142 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1143 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1144 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1145 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1146 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1147 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1148 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1149 ; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] glc dlc
1150 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1151 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc
1152 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1153 ; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5
1154 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1155 ; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1156 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1157 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1158 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
1159 ; GFX11-NEXT: s_endpgm
1160 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1161 %tid.ext = sext i32 %tid to i64
1162 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1163 %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
1164 %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
1165 %x = load volatile float, ptr addrspace(1) %x.gep
1166 %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
; A single i1 condition selects between whole vectors; here the constant vector is the true operand.
1167 %setcc = fcmp ugt float %x, 4.0
1168 %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
1169 store <4 x float> %select, ptr addrspace(1) %out.gep
1173 ; This must be swapped as a vector type before the condition has multiple uses.
; The constant is the FIRST operand of the compare here ("4.0 ugt x"), and the
; constant vector <1.0, 2.0, -0.5, 4.0> is the true operand of the
; <4 x float> select, with the loaded vector z as the false operand.
; x is indexed as a float element and z / %out as <4 x float> elements, all by
; workitem.id.x; both loads are volatile.
; The expected output for every target has one v_cmp_le_f32 vcc, 4.0, x and
; four v_cndmask_b32 sharing vcc, each with its constant lane as the first
; source operand.
1175 define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1176 ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
1178 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1179 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1180 ; SI-NEXT: s_mov_b32 s11, 0xf000
1181 ; SI-NEXT: s_mov_b32 s10, 0
1182 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1183 ; SI-NEXT: v_mov_b32_e32 v2, 0
1184 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1185 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
1186 ; SI-NEXT: v_mov_b32_e32 v5, v2
1187 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1188 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1189 ; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
1190 ; SI-NEXT: s_waitcnt vmcnt(0)
1191 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc
1192 ; SI-NEXT: s_waitcnt vmcnt(0)
1193 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1194 ; SI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6
1195 ; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1196 ; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1197 ; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1198 ; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1199 ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
1202 ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
1204 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1205 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1206 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1207 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
1208 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1209 ; VI-NEXT: v_mov_b32_e32 v2, s3
1210 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
1211 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1212 ; VI-NEXT: v_mov_b32_e32 v0, s5
1213 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5
1214 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
1215 ; VI-NEXT: flat_load_dword v6, v[1:2] glc
1216 ; VI-NEXT: s_waitcnt vmcnt(0)
1217 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc
1218 ; VI-NEXT: s_waitcnt vmcnt(0)
1219 ; VI-NEXT: v_mov_b32_e32 v7, s1
1220 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5
1221 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
1222 ; VI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6
1223 ; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1224 ; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1225 ; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1226 ; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1227 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1230 ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
1232 ; GFX10-NEXT: s_clause 0x1
1233 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1234 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1235 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1236 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0
1237 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1238 ; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc
1239 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1240 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc
1241 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1242 ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6
1243 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1244 ; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1245 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1246 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1247 ; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
1248 ; GFX10-NEXT: s_endpgm
1250 ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
1252 ; GFX11-NEXT: s_clause 0x1
1253 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1254 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1255 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1256 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1257 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1258 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1259 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1260 ; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] glc dlc
1261 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1262 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc
1263 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1264 ; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5
1265 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1266 ; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1267 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1268 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1269 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
1270 ; GFX11-NEXT: s_endpgm
1271 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1272 %tid.ext = sext i32 %tid to i64
1273 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1274 %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
1275 %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
1276 %x = load volatile float, ptr addrspace(1) %x.gep
1277 %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
; Note the operand order of the fcmp below, which has the constant on the left and the loaded x on the right.
1278 %setcc = fcmp ugt float 4.0, %x
1279 %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
1280 store <4 x float> %select, ptr addrspace(1) %out.gep
; i1 variant of the select, "x < 0 ? true : z", where x is an i32 and z / the
; stored result are i1 elements, all indexed by workitem.id.x (volatile loads).
; The expected output for every target contains no v_cndmask on z. Instead the
; loaded byte is masked to bit 0 (v_and_b32 with 1), compared for equality with
; 1, OR-ed (s_or_b64) with the result of v_cmp_gt_i32 vcc, 0, x, and the
; combined mask is turned into 0/1 by v_cndmask_b32_e64 before the byte store.
1284 define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1285 ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
1287 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
1288 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
1289 ; SI-NEXT: s_mov_b32 s6, 0
1290 ; SI-NEXT: v_mov_b32_e32 v1, 0
1291 ; SI-NEXT: s_mov_b32 s7, 0xf000
1292 ; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1293 ; SI-NEXT: v_mov_b32_e32 v3, v1
1294 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1295 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1296 ; SI-NEXT: s_mov_b64 s[4:5], s[10:11]
1297 ; SI-NEXT: buffer_load_dword v2, v[2:3], s[4:7], 0 addr64 glc
1298 ; SI-NEXT: s_waitcnt vmcnt(0)
1299 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 glc
1300 ; SI-NEXT: s_waitcnt vmcnt(0)
1301 ; SI-NEXT: s_mov_b64 s[10:11], s[6:7]
1302 ; SI-NEXT: v_and_b32_e32 v3, 1, v3
1303 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
1304 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3
1305 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1306 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1307 ; SI-NEXT: buffer_store_byte v2, v[0:1], s[8:11], 0 addr64
1310 ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
1312 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1313 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1314 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1315 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1316 ; VI-NEXT: v_mov_b32_e32 v2, s3
1317 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
1318 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1319 ; VI-NEXT: v_mov_b32_e32 v4, s5
1320 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v0
1321 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
1322 ; VI-NEXT: flat_load_dword v2, v[1:2] glc
1323 ; VI-NEXT: s_waitcnt vmcnt(0)
1324 ; VI-NEXT: flat_load_ubyte v3, v[3:4] glc
1325 ; VI-NEXT: s_waitcnt vmcnt(0)
1326 ; VI-NEXT: v_mov_b32_e32 v1, s1
1327 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1328 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1329 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
1330 ; VI-NEXT: v_and_b32_e32 v3, 1, v3
1331 ; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3
1332 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1333 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1334 ; VI-NEXT: flat_store_byte v[0:1], v2
1337 ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
1339 ; GFX10-NEXT: s_clause 0x1
1340 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
1341 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
1342 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1343 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1344 ; GFX10-NEXT: global_load_dword v2, v1, s[10:11] glc dlc
1345 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1346 ; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] glc dlc
1347 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1348 ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
1349 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v3
1350 ; GFX10-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
1351 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1352 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
1353 ; GFX10-NEXT: global_store_byte v0, v1, s[8:9]
1354 ; GFX10-NEXT: s_endpgm
1356 ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
1358 ; GFX11-NEXT: s_clause 0x1
1359 ; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
1360 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
1361 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1362 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1363 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1364 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1365 ; GFX11-NEXT: global_load_b32 v1, v1, s[10:11] glc dlc
1366 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1367 ; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] glc dlc
1368 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1369 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
1370 ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
1371 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1372 ; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2
1373 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1374 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1375 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
1376 ; GFX11-NEXT: global_store_b8 v0, v1, s[8:9]
1377 ; GFX11-NEXT: s_endpgm
1378 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1379 %tid.ext = sext i32 %tid to i64
1380 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
1381 %z.gep = getelementptr inbounds i1, ptr addrspace(1) %z.ptr, i64 %tid.ext
1382 %out.gep = getelementptr inbounds i1, ptr addrspace(1) %out, i64 %tid.ext
1383 %x = load volatile i32, ptr addrspace(1) %x.gep
1384 %z = load volatile i1, ptr addrspace(1) %z.gep
; Selecting the constant true against an i1 value z is a logical OR of the condition and z.
1385 %setcc = icmp slt i32 %x, 0
1386 %select = select i1 %setcc, i1 true, i1 %z
1387 store i1 %select, ptr addrspace(1) %out.gep
1391 ; Different types compared vs. selected
; The compare is on an f32 value ("x ult 0.0") while the selected values are
; doubles: constant 1.0 as the true operand, the loaded z as the false operand.
; x is indexed as a float element and z / %out as double elements, all by
; workitem.id.x; both loads are volatile.
; The expected output for every target has one v_cmp_le_f32 vcc, 0, x and two
; v_cndmask_b32 sharing vcc. The high dword selects against 0x3ff00000 (the
; upper 32 bits of double 1.0) and the low dword against 0. The SI and VI
; expectations first move 0x3ff00000 into a VGPR, whereas the GFX10 and GFX11
; expectations use it directly as a v_cndmask_b32 operand.
1392 define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1393 ; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
1395 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1396 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1397 ; SI-NEXT: s_mov_b32 s11, 0xf000
1398 ; SI-NEXT: s_mov_b32 s10, 0
1399 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1400 ; SI-NEXT: v_mov_b32_e32 v2, 0
1401 ; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
1402 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
1403 ; SI-NEXT: v_mov_b32_e32 v4, v2
1404 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1405 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1406 ; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
1407 ; SI-NEXT: s_waitcnt vmcnt(0)
1408 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc
1409 ; SI-NEXT: s_waitcnt vmcnt(0)
1410 ; SI-NEXT: v_mov_b32_e32 v5, 0x3ff00000
1411 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1412 ; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2
1413 ; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
1414 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1415 ; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64
1418 ; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
1420 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1421 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1422 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1423 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
1424 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1425 ; VI-NEXT: v_mov_b32_e32 v2, s3
1426 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
1427 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1428 ; VI-NEXT: v_mov_b32_e32 v0, s5
1429 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5
1430 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
1431 ; VI-NEXT: flat_load_dword v6, v[1:2] glc
1432 ; VI-NEXT: s_waitcnt vmcnt(0)
1433 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc
1434 ; VI-NEXT: s_waitcnt vmcnt(0)
1435 ; VI-NEXT: v_mov_b32_e32 v3, s1
1436 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v5
1437 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1438 ; VI-NEXT: v_mov_b32_e32 v4, 0x3ff00000
1439 ; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v6
1440 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
1441 ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1442 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1445 ; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
1447 ; GFX10-NEXT: s_clause 0x1
1448 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1449 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1450 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1451 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
1452 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1453 ; GFX10-NEXT: global_load_dword v4, v2, s[2:3] glc dlc
1454 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1455 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] glc dlc
1456 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1457 ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v4
1458 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
1459 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1460 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
1461 ; GFX10-NEXT: s_endpgm
1463 ; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
1465 ; GFX11-NEXT: s_clause 0x1
1466 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1467 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1468 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1469 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1470 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1471 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1472 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1473 ; GFX11-NEXT: global_load_b32 v3, v1, s[2:3] glc dlc
1474 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1475 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[4:5] glc dlc
1476 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1477 ; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v3
1478 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
1479 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1480 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1481 ; GFX11-NEXT: s_endpgm
1482 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1483 %tid.ext = sext i32 %tid to i64
1484 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1485 %z.gep = getelementptr inbounds double, ptr addrspace(1) %z.ptr, i64 %tid.ext
1486 %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
1487 %x = load volatile float, ptr addrspace(1) %x.gep
1488 %z = load volatile double, ptr addrspace(1) %z.gep
; The condition comes from a float compare, but the select operands and the stored result are doubles.
1489 %setcc = fcmp ult float %x, 0.0
1490 %select = select i1 %setcc, double 1.0, double %z
1491 store double %select, ptr addrspace(1) %out.gep
1495 ; Different types compared vs. selected
; The condition is a float compare (fcmp one %x, 0.0) while the selected
; values are i64: the constant 3 when the condition holds, otherwise %z.
; For every target the checks below expect the inverted compare
; v_cmp_nlg_f32_e32 followed by two v_cndmask_b32_e32, one per 32-bit half
; of the i64, with the constant halves (3 and 0) as the first source operand
; and the loaded %z halves as the second.
1496 define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1497 ; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
1499 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1500 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1501 ; SI-NEXT: s_mov_b32 s11, 0xf000
1502 ; SI-NEXT: s_mov_b32 s10, 0
1503 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1504 ; SI-NEXT: v_mov_b32_e32 v2, 0
1505 ; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
1506 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
1507 ; SI-NEXT: v_mov_b32_e32 v4, v2
1508 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1509 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1510 ; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
1511 ; SI-NEXT: s_waitcnt vmcnt(0)
1512 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc
1513 ; SI-NEXT: s_waitcnt vmcnt(0)
1514 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1515 ; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2
1516 ; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
1517 ; SI-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
1518 ; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64
1521 ; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
1523 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1524 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1525 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1526 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
1527 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1528 ; VI-NEXT: v_mov_b32_e32 v2, s3
1529 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
1530 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1531 ; VI-NEXT: v_mov_b32_e32 v0, s5
1532 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5
1533 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
1534 ; VI-NEXT: flat_load_dword v6, v[1:2] glc
1535 ; VI-NEXT: s_waitcnt vmcnt(0)
1536 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc
1537 ; VI-NEXT: s_waitcnt vmcnt(0)
1538 ; VI-NEXT: v_mov_b32_e32 v3, s1
1539 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v5
1540 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1541 ; VI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v6
1542 ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
1543 ; VI-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
1544 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1547 ; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
1549 ; GFX10-NEXT: s_clause 0x1
1550 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1551 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1552 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1553 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
1554 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1555 ; GFX10-NEXT: global_load_dword v4, v2, s[2:3] glc dlc
1556 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1557 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] glc dlc
1558 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1559 ; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4
1560 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
1561 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
1562 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
1563 ; GFX10-NEXT: s_endpgm
1565 ; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
1567 ; GFX11-NEXT: s_clause 0x1
1568 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1569 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1570 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1571 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1572 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1573 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1574 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1575 ; GFX11-NEXT: global_load_b32 v3, v1, s[2:3] glc dlc
1576 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1577 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[4:5] glc dlc
1578 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1579 ; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3
1580 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
1581 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
1582 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1583 ; GFX11-NEXT: s_endpgm
1584 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1585 %tid.ext = sext i32 %tid to i64
1586 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1587 %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
1588 %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
; Both inputs are loaded volatile; the checks above expect each load to carry
; glc (plus dlc on GFX10 and GFX11) and to be followed by its own
; s_waitcnt vmcnt(0).
1589 %x = load volatile float, ptr addrspace(1) %x.gep
1590 %z = load volatile i64, ptr addrspace(1) %z.gep
1591 %setcc = fcmp one float %x, 0.0
1592 %select = select i1 %setcc, i64 3, i64 %z
1593 store i64 %select, ptr addrspace(1) %out.gep
1597 ; Different types compared vs. selected
; Mirror image of the preceding float-compare/i64-select case: here the
; condition is an integer compare (icmp ugt %x, 1) and the selected values
; are floats, 4.0 when the condition holds and %z otherwise.
; For every target the checks below expect the inverted compare
; v_cmp_gt_u32_e32 vcc, 2, <x> and a single v_cndmask_b32_e32 with 4.0 as
; the first source operand and the loaded %z as the second.
1598 define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1599 ; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
1601 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1602 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1603 ; SI-NEXT: s_mov_b32 s11, 0xf000
1604 ; SI-NEXT: s_mov_b32 s10, 0
1605 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1606 ; SI-NEXT: v_mov_b32_e32 v1, 0
1607 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
1608 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1609 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1610 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
1611 ; SI-NEXT: s_waitcnt vmcnt(0)
1612 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
1613 ; SI-NEXT: s_waitcnt vmcnt(0)
1614 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1615 ; SI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2
1616 ; SI-NEXT: v_cndmask_b32_e32 v2, 4.0, v3, vcc
1617 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1620 ; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
1622 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1623 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1624 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1625 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1626 ; VI-NEXT: v_mov_b32_e32 v1, s3
1627 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1628 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1629 ; VI-NEXT: v_mov_b32_e32 v3, s5
1630 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1631 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1632 ; VI-NEXT: flat_load_dword v5, v[0:1] glc
1633 ; VI-NEXT: s_waitcnt vmcnt(0)
1634 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
1635 ; VI-NEXT: s_waitcnt vmcnt(0)
1636 ; VI-NEXT: v_mov_b32_e32 v1, s1
1637 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
1638 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1639 ; VI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v5
1640 ; VI-NEXT: v_cndmask_b32_e32 v2, 4.0, v2, vcc
1641 ; VI-NEXT: flat_store_dword v[0:1], v2
1644 ; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
1646 ; GFX10-NEXT: s_clause 0x1
1647 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1648 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1649 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1650 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1651 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
1652 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1653 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
1654 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1655 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1
1656 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc
1657 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1658 ; GFX10-NEXT: s_endpgm
1660 ; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
1662 ; GFX11-NEXT: s_clause 0x1
1663 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1664 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1665 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1666 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1667 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1668 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1669 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
1670 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1671 ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
1672 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1673 ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1
1674 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc
1675 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1676 ; GFX11-NEXT: s_endpgm
1677 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1678 %tid.ext = sext i32 %tid to i64
1679 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
1680 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
1681 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Both inputs are loaded volatile; the checks above expect each load to carry
; glc (plus dlc on GFX10 and GFX11) and to be followed by its own
; s_waitcnt vmcnt(0).
1682 %x = load volatile i32, ptr addrspace(1) %x.gep
1683 %z = load volatile float, ptr addrspace(1) %z.gep
1684 %setcc = icmp ugt i32 %x, 1
1685 %select = select i1 %setcc, float 4.0, float %z
1686 store float %select, ptr addrspace(1) %out.gep
1690 ; FIXME: Should be able to handle multiple uses
; A single condition (fcmp ugt 4.0, %x) feeds two selects that pick -1.0 and
; -2.0 respectively over the same fallback %z; both results are written to
; the same address with volatile stores.
; What the checks below currently pin for every target: one
; v_cmp_nle_f32_e32 vcc, 4.0, <x> followed by two v_cndmask_b32_e64 that take
; the loaded %z as the first source operand and the constant as the second.
1691 define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1692 ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
1694 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1695 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1696 ; SI-NEXT: s_mov_b32 s11, 0xf000
1697 ; SI-NEXT: s_mov_b32 s10, 0
1698 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1699 ; SI-NEXT: v_mov_b32_e32 v1, 0
1700 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
1701 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1702 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1703 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
1704 ; SI-NEXT: s_waitcnt vmcnt(0)
1705 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
1706 ; SI-NEXT: s_waitcnt vmcnt(0)
1707 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1708 ; SI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v2
1709 ; SI-NEXT: v_cndmask_b32_e64 v2, v3, -1.0, vcc
1710 ; SI-NEXT: v_cndmask_b32_e64 v3, v3, -2.0, vcc
1711 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1712 ; SI-NEXT: s_waitcnt vmcnt(0)
1713 ; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
1714 ; SI-NEXT: s_waitcnt vmcnt(0)
1717 ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
1719 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1720 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1721 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1722 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1723 ; VI-NEXT: v_mov_b32_e32 v1, s3
1724 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1725 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1726 ; VI-NEXT: v_mov_b32_e32 v3, s5
1727 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1728 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1729 ; VI-NEXT: flat_load_dword v5, v[0:1] glc
1730 ; VI-NEXT: s_waitcnt vmcnt(0)
1731 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
1732 ; VI-NEXT: s_waitcnt vmcnt(0)
1733 ; VI-NEXT: v_mov_b32_e32 v1, s1
1734 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
1735 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1736 ; VI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v5
1737 ; VI-NEXT: v_cndmask_b32_e64 v3, v2, -1.0, vcc
1738 ; VI-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc
1739 ; VI-NEXT: flat_store_dword v[0:1], v3
1740 ; VI-NEXT: s_waitcnt vmcnt(0)
1741 ; VI-NEXT: flat_store_dword v[0:1], v2
1742 ; VI-NEXT: s_waitcnt vmcnt(0)
1745 ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
1747 ; GFX10-NEXT: s_clause 0x1
1748 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1749 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1750 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1751 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1752 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
1753 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1754 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
1755 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1756 ; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1
1757 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc
1758 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc
1759 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1760 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1761 ; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
1762 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1763 ; GFX10-NEXT: s_endpgm
1765 ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
1767 ; GFX11-NEXT: s_clause 0x1
1768 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1769 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1770 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1771 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1772 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1773 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1774 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
1775 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1776 ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
1777 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1778 ; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1
1779 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc
1780 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc
1781 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
1782 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1783 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc
1784 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1785 ; GFX11-NEXT: s_endpgm
1786 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1787 %tid.ext = sext i32 %tid to i64
1788 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1789 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
1790 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
1791 %x = load volatile float, ptr addrspace(1) %x.gep
1792 %z = load volatile float, ptr addrspace(1) %z.gep
1793 %setcc = fcmp ugt float 4.0, %x
; %setcc has two users; this is the multiple-use situation the FIXME refers to.
1794 %select0 = select i1 %setcc, float -1.0, float %z
1795 %select1 = select i1 %setcc, float -2.0, float %z
; Both stores are volatile and target the same address; the checks above
; expect two separate store instructions, each followed by a wait.
1796 store volatile float %select0, ptr addrspace(1) %out.gep
1797 store volatile float %select1, ptr addrspace(1) %out.gep
1801 ; Source modifiers abs/neg only work for f32
; Selects between fabs(%f) and fneg(%f) on a half value, based on %c != 0.
; Expected lowering in the checks below:
;  - the SI run converts through f32 and carries the modifiers on the two
;    v_cvt_f32_f16_e64 instructions (|v0| and -v0), then uses a plain
;    v_cndmask_b32_e32 and converts back with v_cvt_f16_f32_e32;
;  - the VI, GFX10 and GFX11 runs materialize abs/neg with explicit
;    v_and_b32 0x7fff / v_xor_b32 0x8000 and use an unmodified
;    v_cndmask_b32_e32.
1802 define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
1803 ; SI-LABEL: v_cndmask_abs_neg_f16:
1805 ; SI-NEXT: s_load_dword s8, s[4:5], 0xb
1806 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
1807 ; SI-NEXT: s_mov_b32 s7, 0xf000
1808 ; SI-NEXT: s_mov_b32 s2, 0
1809 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1810 ; SI-NEXT: v_mov_b32_e32 v1, 0
1811 ; SI-NEXT: s_mov_b32 s3, s7
1812 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1813 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
1814 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
1815 ; SI-NEXT: s_mov_b32 s6, -1
1816 ; SI-NEXT: s_cmp_lg_u32 s8, 0
1817 ; SI-NEXT: s_waitcnt vmcnt(0)
1818 ; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0|
1819 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
1820 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1821 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1822 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1823 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1824 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
1827 ; VI-LABEL: v_cndmask_abs_neg_f16:
1829 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
1830 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1831 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1832 ; VI-NEXT: v_mov_b32_e32 v1, s1
1833 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1834 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1835 ; VI-NEXT: flat_load_ushort v0, v[0:1]
1836 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
1837 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1838 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1839 ; VI-NEXT: s_cmp_lg_u32 s2, 0
1840 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1841 ; VI-NEXT: s_waitcnt vmcnt(0)
1842 ; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0
1843 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1844 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
1845 ; VI-NEXT: v_mov_b32_e32 v0, s0
1846 ; VI-NEXT: v_mov_b32_e32 v1, s1
1847 ; VI-NEXT: flat_store_short v[0:1], v2
1850 ; GFX10-LABEL: v_cndmask_abs_neg_f16:
1852 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
1853 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1854 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1855 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1856 ; GFX10-NEXT: global_load_ushort v0, v0, s[0:1]
1857 ; GFX10-NEXT: s_clause 0x1
1858 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
1859 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
1860 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1861 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1862 ; GFX10-NEXT: s_cmp_lg_u32 s2, 0
1863 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0
1864 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1865 ; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff, v0
1866 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1867 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1868 ; GFX10-NEXT: global_store_short v2, v0, s[0:1]
1869 ; GFX10-NEXT: s_endpgm
1871 ; GFX11-LABEL: v_cndmask_abs_neg_f16:
1873 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
1874 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1875 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1876 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1877 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1878 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1879 ; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
1880 ; GFX11-NEXT: s_clause 0x1
1881 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
1882 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
1883 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1884 ; GFX11-NEXT: s_cmp_lg_u32 s2, 0
1885 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
1886 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1887 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0
1888 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1889 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1890 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1891 ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
1892 ; GFX11-NEXT: s_endpgm
1893 %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
1894 %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx
1895 %f = load half, ptr addrspace(1) %f.gep
; Both select operands derive from the same loaded value %f.
1896 %f.abs = call half @llvm.fabs.f16(half %f)
1897 %f.neg = fneg half %f
; The condition depends only on the kernel argument %c, so the checks above
; expect a scalar compare (s_cmp_lg_u32 + s_cselect_b64) rather than a
; vector compare.
1898 %setcc = icmp ne i32 %c, 0
1899 %select = select i1 %setcc, half %f.abs, half %f.neg
1900 store half %select, ptr addrspace(1) %out
; Selects between fabs(%f) and fneg(%f) on a float value, based on %c != 0.
; For every target the checks below expect both operations to be folded as
; source modifiers into a single v_cndmask_b32_e64 (-v0, |v0|), with the
; condition held in an SGPR pair produced by s_cmp_lg_u32 + s_cselect_b64.
1904 define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
1905 ; SI-LABEL: v_cndmask_abs_neg_f32:
1907 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
1908 ; SI-NEXT: s_load_dword s8, s[4:5], 0xb
1909 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1910 ; SI-NEXT: s_mov_b32 s3, 0xf000
1911 ; SI-NEXT: s_mov_b32 s6, 0
1912 ; SI-NEXT: s_mov_b32 s7, s3
1913 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1914 ; SI-NEXT: v_mov_b32_e32 v1, 0
1915 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1916 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1917 ; SI-NEXT: s_mov_b32 s2, -1
1918 ; SI-NEXT: s_cmp_lg_u32 s8, 0
1919 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
1920 ; SI-NEXT: s_waitcnt vmcnt(0)
1921 ; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[4:5]
1922 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1925 ; VI-LABEL: v_cndmask_abs_neg_f32:
1927 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
1928 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1929 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1930 ; VI-NEXT: v_mov_b32_e32 v1, s1
1931 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1932 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1933 ; VI-NEXT: flat_load_dword v0, v[0:1]
1934 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
1935 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1936 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1937 ; VI-NEXT: s_cmp_lg_u32 s2, 0
1938 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
1939 ; VI-NEXT: s_waitcnt vmcnt(0)
1940 ; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3]
1941 ; VI-NEXT: v_mov_b32_e32 v0, s0
1942 ; VI-NEXT: v_mov_b32_e32 v1, s1
1943 ; VI-NEXT: flat_store_dword v[0:1], v2
1946 ; GFX10-LABEL: v_cndmask_abs_neg_f32:
1948 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
1949 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1950 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1951 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1952 ; GFX10-NEXT: global_load_dword v0, v0, s[0:1]
1953 ; GFX10-NEXT: s_clause 0x1
1954 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
1955 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
1956 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1957 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1958 ; GFX10-NEXT: s_cmp_lg_u32 s2, 0
1959 ; GFX10-NEXT: s_cselect_b64 s[2:3], -1, 0
1960 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1961 ; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
1962 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1963 ; GFX10-NEXT: s_endpgm
1965 ; GFX11-LABEL: v_cndmask_abs_neg_f32:
1967 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
1968 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1969 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1970 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1971 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1972 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1973 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
1974 ; GFX11-NEXT: s_clause 0x1
1975 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
1976 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
1977 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1978 ; GFX11-NEXT: s_cmp_lg_u32 s2, 0
1979 ; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0
1980 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1981 ; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
1982 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1983 ; GFX11-NEXT: s_endpgm
1984 %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
1985 %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
1986 %f = load float, ptr addrspace(1) %f.gep
; Both select operands derive from the same loaded value %f.
1987 %f.abs = call float @llvm.fabs.f32(float %f)
1988 %f.neg = fneg float %f
1989 %setcc = icmp ne i32 %c, 0
1990 %select = select i1 %setcc, float %f.abs, float %f.neg
1991 store float %select, ptr addrspace(1) %out
; Selects between fabs(%f) and fneg(%f) on a double value, based on %c != 0.
; For every target the checks below expect abs/neg to be materialized on the
; high dword only, with explicit v_and_b32 0x7fffffff / v_xor_b32 0x80000000,
; followed by one v_cndmask_b32_e32 per dword and no source modifiers.
; NOTE(review): the low-dword v_cndmask_b32_e32 in the expected output
; selects between identical operands (v0, v0), which looks redundant -
; confirm whether this is a known missed optimization before relying on it.
1995 define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
1996 ; SI-LABEL: v_cndmask_abs_neg_f64:
1998 ; SI-NEXT: s_load_dword s8, s[4:5], 0xb
1999 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
2000 ; SI-NEXT: s_mov_b32 s7, 0xf000
2001 ; SI-NEXT: s_mov_b32 s2, 0
2002 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
2003 ; SI-NEXT: v_mov_b32_e32 v1, 0
2004 ; SI-NEXT: s_mov_b32 s3, s7
2005 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2006 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
2007 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
2008 ; SI-NEXT: s_mov_b32 s6, -1
2009 ; SI-NEXT: s_cmp_lg_u32 s8, 0
2010 ; SI-NEXT: s_waitcnt vmcnt(0)
2011 ; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
2012 ; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
2013 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
2014 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2015 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
2016 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2017 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2020 ; VI-LABEL: v_cndmask_abs_neg_f64:
2022 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
2023 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
2024 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2025 ; VI-NEXT: v_mov_b32_e32 v1, s1
2026 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2027 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2028 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2029 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
2030 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2031 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2032 ; VI-NEXT: s_cmp_lg_u32 s2, 0
2033 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2034 ; VI-NEXT: s_waitcnt vmcnt(0)
2035 ; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
2036 ; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
2037 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2038 ; VI-NEXT: v_mov_b32_e32 v3, s1
2039 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
2040 ; VI-NEXT: v_mov_b32_e32 v2, s0
2041 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2044 ; GFX10-LABEL: v_cndmask_abs_neg_f64:
2046 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
2047 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
2048 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
2049 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2050 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
2051 ; GFX10-NEXT: s_clause 0x1
2052 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
2053 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
2054 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2055 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2056 ; GFX10-NEXT: s_cmp_lg_u32 s2, 0
2057 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0
2058 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2059 ; GFX10-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
2060 ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
2061 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
2062 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2063 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
2064 ; GFX10-NEXT: s_endpgm
2066 ; GFX11-LABEL: v_cndmask_abs_neg_f64:
2068 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
2069 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2070 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
2071 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2072 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
2073 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2074 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
2075 ; GFX11-NEXT: s_clause 0x1
2076 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
2077 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
2078 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2079 ; GFX11-NEXT: s_cmp_lg_u32 s2, 0
2080 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
2081 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2082 ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
2083 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
2084 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
2085 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2086 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2087 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
2088 ; GFX11-NEXT: s_endpgm
2089 %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
2090 %f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx
2091 %f = load double, ptr addrspace(1) %f.gep
; Both select operands derive from the same loaded value %f.
2092 %f.abs = call double @llvm.fabs.f64(double %f)
2093 %f.neg = fneg double %f
2094 %setcc = icmp ne i32 %c, 0
2095 %select = select i1 %setcc, double %f.abs, double %f.neg
2096 store double %select, ptr addrspace(1) %out
2100 attributes #0 = { nounwind }
2101 attributes #1 = { nounwind readnone }