1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
3 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; Basic f32 clamp to [0.0, 1.0], written as minnum(maxnum(a, 0.0), 1.0).
; Each work-item loads aptr[tid], clamps it, and stores the result to out[tid].
; For every run line the checks expect the min/max pair to be emitted as one
; v_max_f32_e64 of the value with itself carrying the clamp modifier, with no
; separate min or max instruction.
7 define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
8 ; GFX6-LABEL: v_clamp_f32:
10 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
11 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
12 ; GFX6-NEXT: s_mov_b32 s6, 0
13 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
14 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
15 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
16 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
17 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
18 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
19 ; GFX6-NEXT: s_waitcnt vmcnt(0)
20 ; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp
21 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
24 ; GFX8-LABEL: v_clamp_f32:
26 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
27 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
28 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
29 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
30 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
31 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
32 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
33 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
34 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
35 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
36 ; GFX8-NEXT: s_waitcnt vmcnt(0)
37 ; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
38 ; GFX8-NEXT: flat_store_dword v[0:1], v2
41 ; GFX9-LABEL: v_clamp_f32:
43 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
44 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
45 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
46 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
47 ; GFX9-NEXT: s_waitcnt vmcnt(0)
48 ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
49 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
52 ; GFX11-LABEL: v_clamp_f32:
54 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
55 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
56 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
57 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
58 ; GFX11-NEXT: s_waitcnt vmcnt(0)
59 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
60 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
62 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
63 ; GFX11-NEXT: s_endpgm
64 %tid = call i32 @llvm.amdgcn.workitem.id.x()
65 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
66 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
67 %a = load float, ptr addrspace(1) %gep0
68 %max = call float @llvm.maxnum.f32(float %a, float 0.0)
69 %med = call float @llvm.minnum.f32(float %max, float 1.0)
71 store float %med, ptr addrspace(1) %out.gep
; f32 clamp to [0.0, 1.0] applied to the negation of the loaded value
; (fneg feeding minnum(maxnum(x, 0.0), 1.0)).
; The checks expect the fneg to be folded into the clamped v_max_f32_e64 as a
; negate source modifier on both operands, rather than a separate instruction.
75 define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
76 ; GFX6-LABEL: v_clamp_neg_f32:
78 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
79 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
80 ; GFX6-NEXT: s_mov_b32 s6, 0
81 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
82 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
83 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
84 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
85 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
86 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
87 ; GFX6-NEXT: s_waitcnt vmcnt(0)
88 ; GFX6-NEXT: v_max_f32_e64 v2, -v2, -v2 clamp
89 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
92 ; GFX8-LABEL: v_clamp_neg_f32:
94 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
95 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
96 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
97 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
98 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
99 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
100 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
101 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
102 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
103 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
104 ; GFX8-NEXT: s_waitcnt vmcnt(0)
105 ; GFX8-NEXT: v_max_f32_e64 v2, -v3, -v3 clamp
106 ; GFX8-NEXT: flat_store_dword v[0:1], v2
107 ; GFX8-NEXT: s_endpgm
109 ; GFX9-LABEL: v_clamp_neg_f32:
111 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
112 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
113 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
114 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
115 ; GFX9-NEXT: s_waitcnt vmcnt(0)
116 ; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
117 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
118 ; GFX9-NEXT: s_endpgm
120 ; GFX11-LABEL: v_clamp_neg_f32:
122 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
123 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
124 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
125 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
126 ; GFX11-NEXT: s_waitcnt vmcnt(0)
127 ; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
128 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
129 ; GFX11-NEXT: s_nop 0
130 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
131 ; GFX11-NEXT: s_endpgm
132 %tid = call i32 @llvm.amdgcn.workitem.id.x()
133 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
134 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
135 %a = load float, ptr addrspace(1) %gep0
136 %fneg.a = fneg float %a
137 %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
138 %med = call float @llvm.minnum.f32(float %max, float 1.0)
140 store float %med, ptr addrspace(1) %out.gep
; f32 clamp to [0.0, 1.0] applied to fneg(fabs(a)) of the loaded value.
; The checks expect both the fabs and the fneg to be folded into the clamped
; v_max_f32_e64 as source modifiers (-|v|) on both operands.
144 define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
145 ; GFX6-LABEL: v_clamp_negabs_f32:
147 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
148 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
149 ; GFX6-NEXT: s_mov_b32 s6, 0
150 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
151 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
152 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
153 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
154 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
155 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
156 ; GFX6-NEXT: s_waitcnt vmcnt(0)
157 ; GFX6-NEXT: v_max_f32_e64 v2, -|v2|, -|v2| clamp
158 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
159 ; GFX6-NEXT: s_endpgm
161 ; GFX8-LABEL: v_clamp_negabs_f32:
163 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
164 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
165 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
166 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
167 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
168 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
169 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
170 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
171 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
172 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
173 ; GFX8-NEXT: s_waitcnt vmcnt(0)
174 ; GFX8-NEXT: v_max_f32_e64 v2, -|v3|, -|v3| clamp
175 ; GFX8-NEXT: flat_store_dword v[0:1], v2
176 ; GFX8-NEXT: s_endpgm
178 ; GFX9-LABEL: v_clamp_negabs_f32:
180 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
181 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
182 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
184 ; GFX9-NEXT: s_waitcnt vmcnt(0)
185 ; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp
186 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
187 ; GFX9-NEXT: s_endpgm
189 ; GFX11-LABEL: v_clamp_negabs_f32:
191 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
192 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
193 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
194 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
195 ; GFX11-NEXT: s_waitcnt vmcnt(0)
196 ; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp
197 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
198 ; GFX11-NEXT: s_nop 0
199 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
200 ; GFX11-NEXT: s_endpgm
201 %tid = call i32 @llvm.amdgcn.workitem.id.x()
202 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
203 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
204 %a = load float, ptr addrspace(1) %gep0
205 %fabs.a = call float @llvm.fabs.f32(float %a)
206 %fneg.fabs.a = fneg float %fabs.a
208 %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
209 %med = call float @llvm.minnum.f32(float %max, float 1.0)
211 store float %med, ptr addrspace(1) %out.gep
; Min/max pair whose lower bound is -0.0 instead of +0.0; the clamped value is
; produced by an fadd carrying the nnan flag.
; The checks expect NO clamp modifier here: GFX6/GFX8/GFX9 keep a v_max_f32
; against the literal 0x80000000 (the f32 bit pattern of -0.0) followed by a
; v_min_f32 with 1.0, and GFX11 uses a single v_maxmin_f32 with the same bounds.
215 define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
216 ; GFX6-LABEL: v_clamp_negzero_f32:
218 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
219 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
220 ; GFX6-NEXT: s_mov_b32 s6, 0
221 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
222 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
223 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
224 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
225 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
226 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
227 ; GFX6-NEXT: s_waitcnt vmcnt(0)
228 ; GFX6-NEXT: v_add_f32_e32 v2, 0.5, v2
229 ; GFX6-NEXT: v_max_f32_e32 v2, 0x80000000, v2
230 ; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
231 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
232 ; GFX6-NEXT: s_endpgm
234 ; GFX8-LABEL: v_clamp_negzero_f32:
236 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
237 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
238 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
239 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
240 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
241 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
242 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
243 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
244 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
245 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
246 ; GFX8-NEXT: s_waitcnt vmcnt(0)
247 ; GFX8-NEXT: v_add_f32_e32 v2, 0.5, v3
248 ; GFX8-NEXT: v_max_f32_e32 v2, 0x80000000, v2
249 ; GFX8-NEXT: v_min_f32_e32 v2, 1.0, v2
250 ; GFX8-NEXT: flat_store_dword v[0:1], v2
251 ; GFX8-NEXT: s_endpgm
253 ; GFX9-LABEL: v_clamp_negzero_f32:
255 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
256 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
257 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
258 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
259 ; GFX9-NEXT: s_waitcnt vmcnt(0)
260 ; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1
261 ; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1
262 ; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1
263 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
264 ; GFX9-NEXT: s_endpgm
266 ; GFX11-LABEL: v_clamp_negzero_f32:
268 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
269 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
270 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
271 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
272 ; GFX11-NEXT: s_waitcnt vmcnt(0)
273 ; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
274 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
275 ; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0
276 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
277 ; GFX11-NEXT: s_nop 0
278 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
279 ; GFX11-NEXT: s_endpgm
280 %tid = call i32 @llvm.amdgcn.workitem.id.x()
281 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
282 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
283 %a = load float, ptr addrspace(1) %gep0
284 %add = fadd nnan float %a, 0.5
285 %max = call float @llvm.maxnum.f32(float %add, float -0.0)
286 %med = call float @llvm.minnum.f32(float %max, float 1.0)
288 store float %med, ptr addrspace(1) %out.gep
292 ; FIXME: -0.0 is handled inconsistently as the lower clamp bound: it is accepted
293 ; when the clamp is matched through med3, but not when it is matched directly. Is this correct?
; Min/max pair with a -0.0 lower bound applied directly to the loaded value
; (no nnan-flagged operation in between).
; The checks expect no clamp modifier, and one extra instruction on the input
; ahead of the max/min: v_mul_f32 by 1.0 on GFX6/GFX8, and v_max_f32 of the
; value with itself on GFX9/GFX11.
; NOTE(review): going by the test name, that extra instruction presumably
; quiets a possible signaling NaN - confirm against the lowering code.
294 define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
295 ; GFX6-LABEL: v_clamp_negzero_maybe_snan_f32:
297 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
298 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
299 ; GFX6-NEXT: s_mov_b32 s6, 0
300 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
301 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
302 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
303 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
304 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
305 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
306 ; GFX6-NEXT: s_waitcnt vmcnt(0)
307 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
308 ; GFX6-NEXT: v_max_f32_e32 v2, 0x80000000, v2
309 ; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
310 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
311 ; GFX6-NEXT: s_endpgm
313 ; GFX8-LABEL: v_clamp_negzero_maybe_snan_f32:
315 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
316 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
317 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
318 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
319 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
320 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
321 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
322 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
323 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
324 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
325 ; GFX8-NEXT: s_waitcnt vmcnt(0)
326 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
327 ; GFX8-NEXT: v_max_f32_e32 v2, 0x80000000, v2
328 ; GFX8-NEXT: v_min_f32_e32 v2, 1.0, v2
329 ; GFX8-NEXT: flat_store_dword v[0:1], v2
330 ; GFX8-NEXT: s_endpgm
332 ; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32:
334 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
335 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
336 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
337 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
338 ; GFX9-NEXT: s_waitcnt vmcnt(0)
339 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
340 ; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1
341 ; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1
342 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
343 ; GFX9-NEXT: s_endpgm
345 ; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32:
347 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
348 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
349 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
350 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
351 ; GFX11-NEXT: s_waitcnt vmcnt(0)
352 ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
353 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
354 ; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0
355 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
356 ; GFX11-NEXT: s_nop 0
357 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
358 ; GFX11-NEXT: s_endpgm
359 %tid = call i32 @llvm.amdgcn.workitem.id.x()
360 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
361 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
362 %a = load float, ptr addrspace(1) %gep0
363 %max = call float @llvm.maxnum.f32(float %a, float -0.0)
364 %med = call float @llvm.minnum.f32(float %max, float 1.0)
366 store float %med, ptr addrspace(1) %out.gep
; f32 minnum(maxnum(a, 0.0), 1.0) where the intermediate maxnum result has a
; second use: besides feeding the minnum it is stored (volatile) to an undef
; address.
; The checks expect no clamp modifier in this case: a separate v_max_f32 with 0
; and v_min_f32 with 1.0 are emitted on every target, and both the min result
; and the intermediate max result are stored.
370 define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
371 ; GFX6-LABEL: v_clamp_multi_use_max_f32:
373 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
374 ; GFX6-NEXT: s_mov_b32 s6, 0
375 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
376 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
377 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
378 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
379 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
380 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
381 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
382 ; GFX6-NEXT: s_mov_b32 s6, -1
383 ; GFX6-NEXT: s_waitcnt vmcnt(0)
384 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
385 ; GFX6-NEXT: v_max_f32_e32 v2, 0, v2
386 ; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v2
387 ; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
388 ; GFX6-NEXT: buffer_store_dword v2, off, s[4:7], 0
389 ; GFX6-NEXT: s_waitcnt vmcnt(0)
390 ; GFX6-NEXT: s_endpgm
392 ; GFX8-LABEL: v_clamp_multi_use_max_f32:
394 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
395 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
396 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
397 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
398 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
399 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
400 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
401 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
402 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
403 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
404 ; GFX8-NEXT: s_waitcnt vmcnt(0)
405 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
406 ; GFX8-NEXT: v_max_f32_e32 v2, 0, v2
407 ; GFX8-NEXT: v_min_f32_e32 v3, 1.0, v2
408 ; GFX8-NEXT: flat_store_dword v[0:1], v3
409 ; GFX8-NEXT: flat_store_dword v[0:1], v2
410 ; GFX8-NEXT: s_waitcnt vmcnt(0)
411 ; GFX8-NEXT: s_endpgm
413 ; GFX9-LABEL: v_clamp_multi_use_max_f32:
415 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
416 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
417 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
418 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
419 ; GFX9-NEXT: s_waitcnt vmcnt(0)
420 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
421 ; GFX9-NEXT: v_max_f32_e32 v1, 0, v1
422 ; GFX9-NEXT: v_min_f32_e32 v2, 1.0, v1
423 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
424 ; GFX9-NEXT: global_store_dword v[0:1], v1, off
425 ; GFX9-NEXT: s_waitcnt vmcnt(0)
426 ; GFX9-NEXT: s_endpgm
428 ; GFX11-LABEL: v_clamp_multi_use_max_f32:
430 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
431 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
432 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
433 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
434 ; GFX11-NEXT: s_waitcnt vmcnt(0)
435 ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
436 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
437 ; GFX11-NEXT: v_max_f32_e32 v1, 0, v1
438 ; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1
439 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
440 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
441 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
442 ; GFX11-NEXT: s_nop 0
443 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
444 ; GFX11-NEXT: s_endpgm
445 %tid = call i32 @llvm.amdgcn.workitem.id.x()
446 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
447 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
448 %a = load float, ptr addrspace(1) %gep0
449 %max = call float @llvm.maxnum.f32(float %a, float 0.0)
450 %med = call float @llvm.minnum.f32(float %max, float 1.0)
452 store float %med, ptr addrspace(1) %out.gep
453 store volatile float %max, ptr addrspace(1) undef
; Basic f16 clamp to [0.0, 1.0], written as minnum(maxnum(a, 0.0), 1.0) on half.
; Each work-item loads aptr[tid], clamps it, and stores the result to out[tid].
; GFX6 checks expect the clamp modifier on the v_cvt_f32_f16_e64 that widens the
; value, followed by a v_cvt_f16_f32 back to half; the other targets expect a
; single v_max_f16_e64 of the value with itself carrying the clamp modifier.
457 define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
458 ; GFX6-LABEL: v_clamp_f16:
460 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
461 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
462 ; GFX6-NEXT: s_mov_b32 s6, 0
463 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
464 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
465 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
466 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
467 ; GFX6-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
468 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
469 ; GFX6-NEXT: s_waitcnt vmcnt(0)
470 ; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
471 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
472 ; GFX6-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
473 ; GFX6-NEXT: s_endpgm
475 ; GFX8-LABEL: v_clamp_f16:
477 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
478 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
479 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
480 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
481 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
482 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
483 ; GFX8-NEXT: flat_load_ushort v3, v[0:1]
484 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
485 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
486 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
487 ; GFX8-NEXT: s_waitcnt vmcnt(0)
488 ; GFX8-NEXT: v_max_f16_e64 v2, v3, v3 clamp
489 ; GFX8-NEXT: flat_store_short v[0:1], v2
490 ; GFX8-NEXT: s_endpgm
492 ; GFX9-LABEL: v_clamp_f16:
494 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
495 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
496 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
497 ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
498 ; GFX9-NEXT: s_waitcnt vmcnt(0)
499 ; GFX9-NEXT: v_max_f16_e64 v1, v1, v1 clamp
500 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
501 ; GFX9-NEXT: s_endpgm
503 ; GFX11-LABEL: v_clamp_f16:
505 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
506 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
507 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
508 ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
509 ; GFX11-NEXT: s_waitcnt vmcnt(0)
510 ; GFX11-NEXT: v_max_f16_e64 v1, v1, v1 clamp
511 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
512 ; GFX11-NEXT: s_nop 0
513 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
514 ; GFX11-NEXT: s_endpgm
515 %tid = call i32 @llvm.amdgcn.workitem.id.x()
516 %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
517 %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
518 %a = load half, ptr addrspace(1) %gep0
519 %max = call half @llvm.maxnum.f16(half %a, half 0.0)
520 %med = call half @llvm.minnum.f16(half %max, half 1.0)
522 store half %med, ptr addrspace(1) %out.gep
; f16 clamp to [0.0, 1.0] applied to the negation of the loaded half value.
; GFX6 checks expect the fneg folded as a source modifier into the clamped
; v_cvt_f32_f16_e64 (followed by v_cvt_f16_f32); the other targets expect it
; folded into both operands of a clamped v_max_f16_e64.
526 define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
527 ; GFX6-LABEL: v_clamp_neg_f16:
529 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
530 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
531 ; GFX6-NEXT: s_mov_b32 s6, 0
532 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
533 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
534 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
535 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
536 ; GFX6-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
537 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
538 ; GFX6-NEXT: s_waitcnt vmcnt(0)
539 ; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -v2 clamp
540 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
541 ; GFX6-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
542 ; GFX6-NEXT: s_endpgm
544 ; GFX8-LABEL: v_clamp_neg_f16:
546 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
547 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
548 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
549 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
550 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
551 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
552 ; GFX8-NEXT: flat_load_ushort v3, v[0:1]
553 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
554 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
555 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
556 ; GFX8-NEXT: s_waitcnt vmcnt(0)
557 ; GFX8-NEXT: v_max_f16_e64 v2, -v3, -v3 clamp
558 ; GFX8-NEXT: flat_store_short v[0:1], v2
559 ; GFX8-NEXT: s_endpgm
561 ; GFX9-LABEL: v_clamp_neg_f16:
563 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
564 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
565 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
566 ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
567 ; GFX9-NEXT: s_waitcnt vmcnt(0)
568 ; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp
569 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
570 ; GFX9-NEXT: s_endpgm
572 ; GFX11-LABEL: v_clamp_neg_f16:
574 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
575 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
576 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
577 ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
578 ; GFX11-NEXT: s_waitcnt vmcnt(0)
579 ; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp
580 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
581 ; GFX11-NEXT: s_nop 0
582 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
583 ; GFX11-NEXT: s_endpgm
584 %tid = call i32 @llvm.amdgcn.workitem.id.x()
585 %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
586 %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
587 %a = load half, ptr addrspace(1) %gep0
588 %fneg.a = fneg half %a
589 %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
590 %med = call half @llvm.minnum.f16(half %max, half 1.0)
592 store half %med, ptr addrspace(1) %out.gep
; f16 clamp to [0.0, 1.0] applied to fneg(fabs(a)) of the loaded half value.
; GFX6 checks expect the -|v| source modifiers on the clamped
; v_cvt_f32_f16_e64 (followed by v_cvt_f16_f32); the other targets expect them
; on both operands of a clamped v_max_f16_e64.
596 define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
597 ; GFX6-LABEL: v_clamp_negabs_f16:
599 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
600 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
601 ; GFX6-NEXT: s_mov_b32 s6, 0
602 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
603 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
604 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
605 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
606 ; GFX6-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
607 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
608 ; GFX6-NEXT: s_waitcnt vmcnt(0)
609 ; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -|v2| clamp
610 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
611 ; GFX6-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
612 ; GFX6-NEXT: s_endpgm
614 ; GFX8-LABEL: v_clamp_negabs_f16:
616 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
617 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
618 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
619 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
620 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
621 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
622 ; GFX8-NEXT: flat_load_ushort v3, v[0:1]
623 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
624 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
625 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
626 ; GFX8-NEXT: s_waitcnt vmcnt(0)
627 ; GFX8-NEXT: v_max_f16_e64 v2, -|v3|, -|v3| clamp
628 ; GFX8-NEXT: flat_store_short v[0:1], v2
629 ; GFX8-NEXT: s_endpgm
631 ; GFX9-LABEL: v_clamp_negabs_f16:
633 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
634 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
635 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
636 ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
637 ; GFX9-NEXT: s_waitcnt vmcnt(0)
638 ; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp
639 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
640 ; GFX9-NEXT: s_endpgm
642 ; GFX11-LABEL: v_clamp_negabs_f16:
644 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
645 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
646 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
647 ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
648 ; GFX11-NEXT: s_waitcnt vmcnt(0)
649 ; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp
650 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
651 ; GFX11-NEXT: s_nop 0
652 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
653 ; GFX11-NEXT: s_endpgm
654 %tid = call i32 @llvm.amdgcn.workitem.id.x()
655 %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
656 %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
657 %a = load half, ptr addrspace(1) %gep0
658 %fabs.a = call half @llvm.fabs.f16(half %a)
659 %fneg.fabs.a = fneg half %fabs.a
661 %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
662 %med = call half @llvm.minnum.f16(half %max, half 1.0)
664 store half %med, ptr addrspace(1) %out.gep
; Basic f64 clamp to [0.0, 1.0], written as minnum(maxnum(a, 0.0), 1.0) on
; double. Each work-item loads aptr[tid], clamps it, and stores to out[tid].
; For every run line the checks expect a single v_max_f64 of the value with
; itself carrying the clamp modifier.
668 define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
669 ; GFX6-LABEL: v_clamp_f64:
671 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
672 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
673 ; GFX6-NEXT: s_mov_b32 s6, 0
674 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0
675 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
676 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
677 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
678 ; GFX6-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
679 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
680 ; GFX6-NEXT: s_waitcnt vmcnt(0)
681 ; GFX6-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] clamp
682 ; GFX6-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
683 ; GFX6-NEXT: s_endpgm
685 ; GFX8-LABEL: v_clamp_f64:
687 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
688 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
689 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
690 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
691 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
692 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
693 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
694 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
695 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
696 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
697 ; GFX8-NEXT: s_waitcnt vmcnt(0)
698 ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
699 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
700 ; GFX8-NEXT: s_endpgm
702 ; GFX9-LABEL: v_clamp_f64:
704 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
705 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
706 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
707 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
708 ; GFX9-NEXT: s_waitcnt vmcnt(0)
709 ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
710 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
711 ; GFX9-NEXT: s_endpgm
713 ; GFX11-LABEL: v_clamp_f64:
715 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
716 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
717 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
718 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
719 ; GFX11-NEXT: s_waitcnt vmcnt(0)
720 ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
721 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
722 ; GFX11-NEXT: s_nop 0
723 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
724 ; GFX11-NEXT: s_endpgm
725 %tid = call i32 @llvm.amdgcn.workitem.id.x()
726 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
727 %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
728 %a = load double, ptr addrspace(1) %gep0
729 %max = call double @llvm.maxnum.f64(double %a, double 0.0)
730 %med = call double @llvm.minnum.f64(double %max, double 1.0)
732 store double %med, ptr addrspace(1) %out.gep
; f64 clamp to [0.0, 1.0] applied to the negation of the loaded double value.
; The checks expect the fneg to be folded into the clamped v_max_f64 as a
; negate source modifier on both operands.
736 define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
737 ; GFX6-LABEL: v_clamp_neg_f64:
739 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
740 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
741 ; GFX6-NEXT: s_mov_b32 s6, 0
742 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0
743 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
744 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
745 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
746 ; GFX6-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
747 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
748 ; GFX6-NEXT: s_waitcnt vmcnt(0)
749 ; GFX6-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] clamp
750 ; GFX6-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
751 ; GFX6-NEXT: s_endpgm
753 ; GFX8-LABEL: v_clamp_neg_f64:
755 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
756 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
757 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
758 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
759 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
760 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
761 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
762 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
763 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
764 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
765 ; GFX8-NEXT: s_waitcnt vmcnt(0)
766 ; GFX8-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
767 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
768 ; GFX8-NEXT: s_endpgm
770 ; GFX9-LABEL: v_clamp_neg_f64:
772 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
773 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
774 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
775 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
776 ; GFX9-NEXT: s_waitcnt vmcnt(0)
777 ; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
778 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
779 ; GFX9-NEXT: s_endpgm
781 ; GFX11-LABEL: v_clamp_neg_f64:
783 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
784 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
785 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
786 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
787 ; GFX11-NEXT: s_waitcnt vmcnt(0)
788 ; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
789 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
790 ; GFX11-NEXT: s_nop 0
791 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
792 ; GFX11-NEXT: s_endpgm
793 %tid = call i32 @llvm.amdgcn.workitem.id.x()
794 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
795 %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
796 %a = load double, ptr addrspace(1) %gep0
797 %fneg.a = fneg double %a
798 %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
799 %med = call double @llvm.minnum.f64(double %max, double 1.0)
801 store double %med, ptr addrspace(1) %out.gep
; Clamp of a negated absolute value (f64). The IR loads a double indexed by the
; workitem id, applies fabs then fneg, and clamps the result with
; maxnum(x, 0.0) followed by minnum(x, 1.0). The assertions for every target
; expect the whole chain to collapse into a single v_max_f64 that carries the
; -|x| source modifiers on both operands plus the clamp bit.
805 define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
806 ; GFX6-LABEL: v_clamp_negabs_f64:
808 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
809 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
810 ; GFX6-NEXT: s_mov_b32 s6, 0
811 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0
812 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
813 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
814 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
815 ; GFX6-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
816 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
817 ; GFX6-NEXT: s_waitcnt vmcnt(0)
818 ; GFX6-NEXT: v_max_f64 v[2:3], -|v[2:3]|, -|v[2:3]| clamp
819 ; GFX6-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
820 ; GFX6-NEXT: s_endpgm
822 ; GFX8-LABEL: v_clamp_negabs_f64:
824 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
825 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
826 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
827 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
828 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
829 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
830 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
831 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
832 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
833 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
834 ; GFX8-NEXT: s_waitcnt vmcnt(0)
835 ; GFX8-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
836 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
837 ; GFX8-NEXT: s_endpgm
839 ; GFX9-LABEL: v_clamp_negabs_f64:
841 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
842 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
843 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
844 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
845 ; GFX9-NEXT: s_waitcnt vmcnt(0)
846 ; GFX9-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
847 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
848 ; GFX9-NEXT: s_endpgm
850 ; GFX11-LABEL: v_clamp_negabs_f64:
852 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
853 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
854 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
855 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
856 ; GFX11-NEXT: s_waitcnt vmcnt(0)
857 ; GFX11-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
858 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
859 ; GFX11-NEXT: s_nop 0
860 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
861 ; GFX11-NEXT: s_endpgm
862 %tid = call i32 @llvm.amdgcn.workitem.id.x()
863 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
864 %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
865 %a = load double, ptr addrspace(1) %gep0
866 %fabs.a = call double @llvm.fabs.f64(double %a)
867 %fneg.fabs.a = fneg double %fabs.a
869 %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
870 %med = call double @llvm.minnum.f64(double %max, double 1.0)
872 store double %med, ptr addrspace(1) %out.gep
; fmed3 whose lower bound is -0.0 instead of +0.0: fmed3(-0.0, 1.0, %a).
; The assertions expect a real v_med3_f32 to remain (no clamp modifier is
; used). The -0.0 bit pattern 0x80000000 is produced with s_brev_b32 of 1 on the
; tahiti, fiji and gfx900 runs, and appears as a literal operand on gfx1100.
876 define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
877 ; GFX6-LABEL: v_clamp_med3_aby_negzero_f32:
879 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
880 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
881 ; GFX6-NEXT: s_mov_b32 s6, 0
882 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
883 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
884 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
885 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
886 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
887 ; GFX6-NEXT: s_brev_b32 s4, 1
888 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
889 ; GFX6-NEXT: s_waitcnt vmcnt(0)
890 ; GFX6-NEXT: v_med3_f32 v2, s4, 1.0, v2
891 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
892 ; GFX6-NEXT: s_endpgm
894 ; GFX8-LABEL: v_clamp_med3_aby_negzero_f32:
896 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
897 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
898 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
899 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
900 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
901 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
902 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
903 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
904 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
905 ; GFX8-NEXT: s_brev_b32 s0, 1
906 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
907 ; GFX8-NEXT: s_waitcnt vmcnt(0)
908 ; GFX8-NEXT: v_med3_f32 v2, s0, 1.0, v3
909 ; GFX8-NEXT: flat_store_dword v[0:1], v2
910 ; GFX8-NEXT: s_endpgm
912 ; GFX9-LABEL: v_clamp_med3_aby_negzero_f32:
914 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
915 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
916 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
917 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
918 ; GFX9-NEXT: s_brev_b32 s2, 1
919 ; GFX9-NEXT: s_waitcnt vmcnt(0)
920 ; GFX9-NEXT: v_med3_f32 v1, s2, 1.0, v1
921 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
922 ; GFX9-NEXT: s_endpgm
924 ; GFX11-LABEL: v_clamp_med3_aby_negzero_f32:
926 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
927 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
928 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
929 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
930 ; GFX11-NEXT: s_waitcnt vmcnt(0)
931 ; GFX11-NEXT: v_med3_f32 v1, 0x80000000, 1.0, v1
932 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
933 ; GFX11-NEXT: s_nop 0
934 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
935 ; GFX11-NEXT: s_endpgm
936 %tid = call i32 @llvm.amdgcn.workitem.id.x()
937 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
938 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
939 %a = load float, ptr addrspace(1) %gep0
940 %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
941 store float %med, ptr addrspace(1) %out.gep
; fmed3 operand order "a, b, y": fmed3(0.0, 1.0, %a), where %a is the float
; loaded at the workitem-id index. The assertions for every target expect the
; intrinsic to be selected as v_max_f32 of the value with itself plus the clamp
; modifier, with no v_med3_f32 left in the output.
945 define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
946 ; GFX6-LABEL: v_clamp_med3_aby_f32:
948 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
949 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
950 ; GFX6-NEXT: s_mov_b32 s6, 0
951 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
952 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
953 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
954 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
955 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
956 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
957 ; GFX6-NEXT: s_waitcnt vmcnt(0)
958 ; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp
959 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
960 ; GFX6-NEXT: s_endpgm
962 ; GFX8-LABEL: v_clamp_med3_aby_f32:
964 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
965 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
966 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
967 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
968 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
969 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
970 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
971 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
972 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
973 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
974 ; GFX8-NEXT: s_waitcnt vmcnt(0)
975 ; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
976 ; GFX8-NEXT: flat_store_dword v[0:1], v2
977 ; GFX8-NEXT: s_endpgm
979 ; GFX9-LABEL: v_clamp_med3_aby_f32:
981 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
982 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
983 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
984 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
985 ; GFX9-NEXT: s_waitcnt vmcnt(0)
986 ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
987 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
988 ; GFX9-NEXT: s_endpgm
990 ; GFX11-LABEL: v_clamp_med3_aby_f32:
992 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
993 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
994 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
995 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
996 ; GFX11-NEXT: s_waitcnt vmcnt(0)
997 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
998 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
999 ; GFX11-NEXT: s_nop 0
1000 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1001 ; GFX11-NEXT: s_endpgm
1002 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1003 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1004 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1005 %a = load float, ptr addrspace(1) %gep0
1006 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
1007 store float %med, ptr addrspace(1) %out.gep
; fmed3 operand order "b, a, y": fmed3(1.0, 0.0, %a), i.e. the two constant
; bounds swapped relative to the "aby" variant, loaded value still last. The
; assertions expect the same result on every target: v_max_f32 of the value
; with itself plus the clamp modifier.
1011 define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1012 ; GFX6-LABEL: v_clamp_med3_bay_f32:
1014 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1015 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1016 ; GFX6-NEXT: s_mov_b32 s6, 0
1017 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1018 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1019 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1020 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
1021 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1022 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
1023 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1024 ; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp
1025 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1026 ; GFX6-NEXT: s_endpgm
1028 ; GFX8-LABEL: v_clamp_med3_bay_f32:
1030 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1031 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1032 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1033 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1034 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1035 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1036 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1037 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1038 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1039 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1040 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1041 ; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
1042 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1043 ; GFX8-NEXT: s_endpgm
1045 ; GFX9-LABEL: v_clamp_med3_bay_f32:
1047 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1048 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1049 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1050 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1051 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1052 ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
1053 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1054 ; GFX9-NEXT: s_endpgm
1056 ; GFX11-LABEL: v_clamp_med3_bay_f32:
1058 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1059 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1060 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1061 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1062 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1063 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
1064 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1065 ; GFX11-NEXT: s_nop 0
1066 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1067 ; GFX11-NEXT: s_endpgm
1068 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1069 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1070 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1071 %a = load float, ptr addrspace(1) %gep0
1072 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
1073 store float %med, ptr addrspace(1) %out.gep
; fmed3 operand order "y, a, b": fmed3(%a, 0.0, 1.0), with the loaded value as
; the first operand followed by the 0.0 and 1.0 bounds. The assertions for
; every target expect v_max_f32 of the value with itself plus the clamp
; modifier.
1077 define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1078 ; GFX6-LABEL: v_clamp_med3_yab_f32:
1080 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1081 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1082 ; GFX6-NEXT: s_mov_b32 s6, 0
1083 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1084 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1085 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1086 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
1087 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1088 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
1089 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1090 ; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp
1091 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1092 ; GFX6-NEXT: s_endpgm
1094 ; GFX8-LABEL: v_clamp_med3_yab_f32:
1096 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1097 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1098 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1099 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1100 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1101 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1102 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1103 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1104 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1105 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1106 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1107 ; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
1108 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1109 ; GFX8-NEXT: s_endpgm
1111 ; GFX9-LABEL: v_clamp_med3_yab_f32:
1113 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1114 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1115 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1116 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1117 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1118 ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
1119 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1120 ; GFX9-NEXT: s_endpgm
1122 ; GFX11-LABEL: v_clamp_med3_yab_f32:
1124 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1125 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1126 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1127 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1128 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1129 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
1130 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1131 ; GFX11-NEXT: s_nop 0
1132 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1133 ; GFX11-NEXT: s_endpgm
1134 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1135 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1136 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1137 %a = load float, ptr addrspace(1) %gep0
1138 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
1139 store float %med, ptr addrspace(1) %out.gep
; fmed3 operand order "y, b, a": fmed3(%a, 1.0, 0.0), with the loaded value
; first and the bounds in descending order. The assertions for every target
; expect v_max_f32 of the value with itself plus the clamp modifier.
1143 define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1144 ; GFX6-LABEL: v_clamp_med3_yba_f32:
1146 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1147 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1148 ; GFX6-NEXT: s_mov_b32 s6, 0
1149 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1150 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1151 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1152 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
1153 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1154 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
1155 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1156 ; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp
1157 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1158 ; GFX6-NEXT: s_endpgm
1160 ; GFX8-LABEL: v_clamp_med3_yba_f32:
1162 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1163 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1164 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1165 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1166 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1167 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1168 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1169 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1170 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1171 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1172 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1173 ; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
1174 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1175 ; GFX8-NEXT: s_endpgm
1177 ; GFX9-LABEL: v_clamp_med3_yba_f32:
1179 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1180 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1181 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1182 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1183 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1184 ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
1185 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1186 ; GFX9-NEXT: s_endpgm
1188 ; GFX11-LABEL: v_clamp_med3_yba_f32:
1190 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1191 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1192 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1193 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1194 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1195 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
1196 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1197 ; GFX11-NEXT: s_nop 0
1198 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1199 ; GFX11-NEXT: s_endpgm
1200 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1201 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1202 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1203 %a = load float, ptr addrspace(1) %gep0
1204 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
1205 store float %med, ptr addrspace(1) %out.gep
; fmed3 operand order "a, y, b": fmed3(0.0, %a, 1.0), with the loaded value in
; the middle operand between the 0.0 and 1.0 bounds. The assertions for every
; target expect v_max_f32 of the value with itself plus the clamp modifier.
1209 define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1210 ; GFX6-LABEL: v_clamp_med3_ayb_f32:
1212 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1213 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1214 ; GFX6-NEXT: s_mov_b32 s6, 0
1215 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1216 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1217 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1218 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
1219 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1220 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
1221 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1222 ; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp
1223 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1224 ; GFX6-NEXT: s_endpgm
1226 ; GFX8-LABEL: v_clamp_med3_ayb_f32:
1228 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1229 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1230 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1231 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1232 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1233 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1234 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1235 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1236 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1237 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1238 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1239 ; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
1240 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1241 ; GFX8-NEXT: s_endpgm
1243 ; GFX9-LABEL: v_clamp_med3_ayb_f32:
1245 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1246 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1247 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1248 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1249 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1250 ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
1251 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1252 ; GFX9-NEXT: s_endpgm
1254 ; GFX11-LABEL: v_clamp_med3_ayb_f32:
1256 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1257 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1258 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1259 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1260 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1261 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
1262 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1263 ; GFX11-NEXT: s_nop 0
1264 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1265 ; GFX11-NEXT: s_endpgm
1266 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1267 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1268 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1269 %a = load float, ptr addrspace(1) %gep0
1270 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
1271 store float %med, ptr addrspace(1) %out.gep
; fmed3 operand order "b, y, a": fmed3(1.0, %a, 0.0), with the loaded value in
; the middle operand and the bounds in descending order. The assertions for
; every target expect v_max_f32 of the value with itself plus the clamp
; modifier.
1275 define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
1276 ; GFX6-LABEL: v_clamp_med3_bya_f32:
1278 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1279 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1280 ; GFX6-NEXT: s_mov_b32 s6, 0
1281 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1282 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1283 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1284 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
1285 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1286 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
1287 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1288 ; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp
1289 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1290 ; GFX6-NEXT: s_endpgm
1292 ; GFX8-LABEL: v_clamp_med3_bya_f32:
1294 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1295 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1296 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1297 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1298 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1299 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1300 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1301 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1302 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1303 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1304 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1305 ; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
1306 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1307 ; GFX8-NEXT: s_endpgm
1309 ; GFX9-LABEL: v_clamp_med3_bya_f32:
1311 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1312 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1313 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1314 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1315 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1316 ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
1317 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1318 ; GFX9-NEXT: s_endpgm
1320 ; GFX11-LABEL: v_clamp_med3_bya_f32:
1322 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1323 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1324 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1325 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1326 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1327 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
1328 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1329 ; GFX11-NEXT: s_nop 0
1330 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1331 ; GFX11-NEXT: s_endpgm
1332 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1333 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1334 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1335 %a = load float, ptr addrspace(1) %gep0
1336 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
1337 store float %med, ptr addrspace(1) %out.gep
; Constant folding, value above the range: fmed3(0.0, 1.0, 4.0) has only
; constant operands. The assertions expect no med3/max instruction at all; each
; target just moves the immediate 1.0 into a VGPR and stores it at the
; workitem-id index of %out.
1341 define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #0 {
1342 ; GFX6-LABEL: v_clamp_constants_to_one_f32:
1344 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1345 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
1346 ; GFX6-NEXT: s_mov_b32 s2, 0
1347 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1348 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1349 ; GFX6-NEXT: v_mov_b32_e32 v2, 1.0
1350 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1351 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1352 ; GFX6-NEXT: s_endpgm
1354 ; GFX8-LABEL: v_clamp_constants_to_one_f32:
1356 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1357 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1358 ; GFX8-NEXT: v_mov_b32_e32 v2, 1.0
1359 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1360 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1361 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1362 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1363 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1364 ; GFX8-NEXT: s_endpgm
1366 ; GFX9-LABEL: v_clamp_constants_to_one_f32:
1368 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1369 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1370 ; GFX9-NEXT: v_mov_b32_e32 v1, 1.0
1371 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1372 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1373 ; GFX9-NEXT: s_endpgm
1375 ; GFX11-LABEL: v_clamp_constants_to_one_f32:
1377 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1378 ; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0
1379 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1380 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1381 ; GFX11-NEXT: s_nop 0
1382 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1383 ; GFX11-NEXT: s_endpgm
1384 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1385 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1386 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
1387 store float %med, ptr addrspace(1) %out.gep
; Constant folding, value below the range: fmed3(0.0, 1.0, -4.0) has only
; constant operands. The assertions expect no med3/max instruction; each target
; stores a VGPR that was set to 0 (the tahiti run reuses the zeroed v1 that also
; forms the high half of the address pair).
1391 define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) #0 {
1392 ; GFX6-LABEL: v_clamp_constants_to_zero_f32:
1394 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1395 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
1396 ; GFX6-NEXT: s_mov_b32 s2, 0
1397 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1398 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1399 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1400 ; GFX6-NEXT: buffer_store_dword v1, v[0:1], s[0:3], 0 addr64
1401 ; GFX6-NEXT: s_endpgm
1403 ; GFX8-LABEL: v_clamp_constants_to_zero_f32:
1405 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1406 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1407 ; GFX8-NEXT: v_mov_b32_e32 v2, 0
1408 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1409 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1410 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1411 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1412 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1413 ; GFX8-NEXT: s_endpgm
1415 ; GFX9-LABEL: v_clamp_constants_to_zero_f32:
1417 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1418 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1419 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1420 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1421 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1422 ; GFX9-NEXT: s_endpgm
1424 ; GFX11-LABEL: v_clamp_constants_to_zero_f32:
1426 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1427 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1428 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1429 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1430 ; GFX11-NEXT: s_nop 0
1431 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1432 ; GFX11-NEXT: s_endpgm
1433 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1434 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1435 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
1436 store float %med, ptr addrspace(1) %out.gep
; Constant folding, value already inside the range: fmed3(0.0, 1.0, 0.5).
; The assertions expect the constant to pass through unchanged: each target
; moves the immediate 0.5 into a VGPR and stores it, with no med3/max
; instruction emitted.
1440 define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) #0 {
1441 ; GFX6-LABEL: v_clamp_constant_preserve_f32:
1443 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1444 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
1445 ; GFX6-NEXT: s_mov_b32 s2, 0
1446 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1447 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1448 ; GFX6-NEXT: v_mov_b32_e32 v2, 0.5
1449 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1450 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1451 ; GFX6-NEXT: s_endpgm
1453 ; GFX8-LABEL: v_clamp_constant_preserve_f32:
1455 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1456 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1457 ; GFX8-NEXT: v_mov_b32_e32 v2, 0.5
1458 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1459 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1460 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1461 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1462 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1463 ; GFX8-NEXT: s_endpgm
1465 ; GFX9-LABEL: v_clamp_constant_preserve_f32:
1467 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1468 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1469 ; GFX9-NEXT: v_mov_b32_e32 v1, 0.5
1470 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1471 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1472 ; GFX9-NEXT: s_endpgm
1474 ; GFX11-LABEL: v_clamp_constant_preserve_f32:
1476 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1477 ; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0
1478 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1479 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1480 ; GFX11-NEXT: s_nop 0
1481 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1482 ; GFX11-NEXT: s_endpgm
1483 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1484 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1485 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
1486 store float %med, ptr addrspace(1) %out.gep
; Constant folding with a denormal input: the third fmed3 operand is the i32
; 8388607 (0x7fffff) bitcast to float, i.e. zero exponent with all mantissa
; bits set. The assertions expect that exact bit pattern, 0x7fffff, to be
; stored on every target, so the fold must not flush the denormal to zero.
1490 define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) %out) #0 {
1491 ; GFX6-LABEL: v_clamp_constant_preserve_denorm_f32:
1493 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1494 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
1495 ; GFX6-NEXT: s_mov_b32 s2, 0
1496 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1497 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1498 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x7fffff
1499 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1500 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1501 ; GFX6-NEXT: s_endpgm
1503 ; GFX8-LABEL: v_clamp_constant_preserve_denorm_f32:
1505 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1506 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1507 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fffff
1508 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1509 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1510 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1511 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1512 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1513 ; GFX8-NEXT: s_endpgm
1515 ; GFX9-LABEL: v_clamp_constant_preserve_denorm_f32:
1517 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1518 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1519 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff
1520 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1521 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1522 ; GFX9-NEXT: s_endpgm
1524 ; GFX11-LABEL: v_clamp_constant_preserve_denorm_f32:
1526 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1527 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
1528 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1529 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1530 ; GFX11-NEXT: s_nop 0
1531 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1532 ; GFX11-NEXT: s_endpgm
1533 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1534 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1535 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
1536 store float %med, ptr addrspace(1) %out.gep
; Constant folding with a quiet NaN input: the third fmed3 operand is the
; constant 0x7FF8000000000000. The assertions expect the result to fold to 0 on
; every target (a VGPR set to 0 is stored; no med3/max instruction is emitted).
; NOTE(review): this presumably reflects the NaN-to-zero behavior of the default
; clamp mode selected by attribute group #0, which is not visible here - confirm.
1540 define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 {
1541 ; GFX6-LABEL: v_clamp_constant_qnan_f32:
1543 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1544 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
1545 ; GFX6-NEXT: s_mov_b32 s2, 0
1546 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1547 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1548 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1549 ; GFX6-NEXT: buffer_store_dword v1, v[0:1], s[0:3], 0 addr64
1550 ; GFX6-NEXT: s_endpgm
1552 ; GFX8-LABEL: v_clamp_constant_qnan_f32:
1554 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1555 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1556 ; GFX8-NEXT: v_mov_b32_e32 v2, 0
1557 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1558 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1559 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1560 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1561 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1562 ; GFX8-NEXT: s_endpgm
1564 ; GFX9-LABEL: v_clamp_constant_qnan_f32:
1566 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1567 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1568 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1569 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1570 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1571 ; GFX9-NEXT: s_endpgm
1573 ; GFX11-LABEL: v_clamp_constant_qnan_f32:
1575 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1576 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1577 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1578 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1579 ; GFX11-NEXT: s_nop 0
1580 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1581 ; GFX11-NEXT: s_endpgm
1582 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1583 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1584 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
1585 store float %med, ptr addrspace(1) %out.gep
; Constant folding with a signaling NaN input: the third fmed3 operand is the
; i32 2139095041 (0x7F800001) bitcast to float, i.e. all exponent bits set, the
; quiet bit clear and a nonzero mantissa. As in the quiet-NaN case, the
; assertions expect the result to fold to 0 on every target.
; NOTE(review): presumably tied to the clamp/NaN mode chosen by attribute group
; #0, which is not visible here - confirm.
1589 define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 {
1590 ; GFX6-LABEL: v_clamp_constant_snan_f32:
1592 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1593 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
1594 ; GFX6-NEXT: s_mov_b32 s2, 0
1595 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1596 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1597 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1598 ; GFX6-NEXT: buffer_store_dword v1, v[0:1], s[0:3], 0 addr64
1599 ; GFX6-NEXT: s_endpgm
1601 ; GFX8-LABEL: v_clamp_constant_snan_f32:
1603 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1604 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1605 ; GFX8-NEXT: v_mov_b32_e32 v2, 0
1606 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1607 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1608 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1609 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1610 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1611 ; GFX8-NEXT: s_endpgm
1613 ; GFX9-LABEL: v_clamp_constant_snan_f32:
1615 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1616 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1617 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1618 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1619 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1620 ; GFX9-NEXT: s_endpgm
1622 ; GFX11-LABEL: v_clamp_constant_snan_f32:
1624 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1625 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1626 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1627 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1628 ; GFX11-NEXT: s_nop 0
1629 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1630 ; GFX11-NEXT: s_endpgm
1631 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1632 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1633 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
1634 store float %med, ptr addrspace(1) %out.gep
1638 ; ---------------------------------------------------------------------
1639 ; Test non-default behaviors enabling snans and disabling dx10_clamp
1640 ; ---------------------------------------------------------------------
; Clamp-to-[0,1] idiom (maxnum with 0.0, then minnum with 1.0) applied to the
; result of an `fadd nnan`, compiled with attribute group #2.
; NOTE(review): #2 is defined outside this chunk; going by the function name it
; presumably disables dx10_clamp -- confirm against the attribute list.
; The expected output on all four targets keeps a separate v_add_f32 followed
; by v_med3_f32 x, 0, 1.0 (GFX11 has an s_delay_alu in between); no clamp
; output modifier is expected.
1642 define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
1643 ; GFX6-LABEL: v_clamp_f32_no_dx10_clamp:
1645 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1646 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1647 ; GFX6-NEXT: s_mov_b32 s6, 0
1648 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1649 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1650 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1651 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
1652 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1653 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
1654 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1655 ; GFX6-NEXT: v_add_f32_e32 v2, 0.5, v2
1656 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, 1.0
1657 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1658 ; GFX6-NEXT: s_endpgm
1660 ; GFX8-LABEL: v_clamp_f32_no_dx10_clamp:
1662 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1663 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1664 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1665 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1666 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1667 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1668 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1669 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1670 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1671 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1672 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1673 ; GFX8-NEXT: v_add_f32_e32 v2, 0.5, v3
1674 ; GFX8-NEXT: v_med3_f32 v2, v2, 0, 1.0
1675 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1676 ; GFX8-NEXT: s_endpgm
1678 ; GFX9-LABEL: v_clamp_f32_no_dx10_clamp:
1680 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1681 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1682 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1683 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1684 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1685 ; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1
1686 ; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
1687 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1688 ; GFX9-NEXT: s_endpgm
1690 ; GFX11-LABEL: v_clamp_f32_no_dx10_clamp:
1692 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1693 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1694 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1695 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1696 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1697 ; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
1698 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1699 ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
1700 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1701 ; GFX11-NEXT: s_nop 0
1702 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1703 ; GFX11-NEXT: s_endpgm
; Per-workitem addressing: read aptr[tid], write out[tid].
1704 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1705 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1706 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1707 %a = load float, ptr addrspace(1) %gep0
; The nnan flag lets the compiler assume the value being clamped is not a NaN.
1708 %a.nnan = fadd nnan float %a, 0.5
; min(max(x, 0.0), 1.0): the clamp idiom under test.
1709 %max = call float @llvm.maxnum.f32(float %a.nnan, float 0.0)
1710 %med = call float @llvm.minnum.f32(float %max, float 1.0)
1712 store float %med, ptr addrspace(1) %out.gep
; Clamp-to-[0,1] idiom applied to the result of a plain `fadd` (no fast-math
; flags), compiled with attribute group #3.
; NOTE(review): #3 is defined outside this chunk; going by the function name it
; presumably enables signaling-NaN handling while keeping dx10_clamp -- confirm.
; The expected output on all four targets is a single v_add_f32_e64 carrying
; the clamp output modifier; no separate min/max/med3 instruction is expected.
1716 define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 {
1717 ; GFX6-LABEL: v_clamp_f32_snan_dx10clamp:
1719 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1720 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1721 ; GFX6-NEXT: s_mov_b32 s6, 0
1722 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1723 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1724 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1725 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
1726 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1727 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
1728 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1729 ; GFX6-NEXT: v_add_f32_e64 v2, v2, 0.5 clamp
1730 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1731 ; GFX6-NEXT: s_endpgm
1733 ; GFX8-LABEL: v_clamp_f32_snan_dx10clamp:
1735 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1736 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1737 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1738 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1739 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1740 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1741 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1742 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1743 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1744 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1745 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1746 ; GFX8-NEXT: v_add_f32_e64 v2, v3, 0.5 clamp
1747 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1748 ; GFX8-NEXT: s_endpgm
1750 ; GFX9-LABEL: v_clamp_f32_snan_dx10clamp:
1752 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1753 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1754 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1755 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1756 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1757 ; GFX9-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
1758 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1759 ; GFX9-NEXT: s_endpgm
1761 ; GFX11-LABEL: v_clamp_f32_snan_dx10clamp:
1763 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1764 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1765 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1766 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1767 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1768 ; GFX11-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
1769 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1770 ; GFX11-NEXT: s_nop 0
1771 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1772 ; GFX11-NEXT: s_endpgm
; Per-workitem addressing: read aptr[tid], write out[tid].
1773 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1774 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1775 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1776 %a = load float, ptr addrspace(1) %gep0
; Unflagged fadd: unlike the nnan variants, nothing here promises a non-NaN value.
1777 %add = fadd float %a, 0.5
; min(max(x, 0.0), 1.0): the clamp idiom under test.
1778 %max = call float @llvm.maxnum.f32(float %add, float 0.0)
1779 %med = call float @llvm.minnum.f32(float %max, float 1.0)
1781 store float %med, ptr addrspace(1) %out.gep
; Clamp-to-[0,1] idiom applied directly to a loaded value (no arithmetic in
; between), compiled with attribute group #4.
; NOTE(review): #4 is defined outside this chunk; going by the function name it
; presumably enables signaling-NaN handling and disables dx10_clamp -- confirm.
; The expected output has one extra instruction ahead of v_med3_f32 x, 0, 1.0:
; v_mul_f32 by 1.0 on GFX6/GFX8 and v_max_f32 x, x on GFX9/GFX11.
; NOTE(review): that extra instruction looks like a canonicalize/quieting step
; for a possibly-signaling input -- confirm against the backend lowering.
1785 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
1786 ; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp:
1788 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1789 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1790 ; GFX6-NEXT: s_mov_b32 s6, 0
1791 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1792 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1793 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1794 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
1795 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1796 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
1797 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1798 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
1799 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, 1.0
1800 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1801 ; GFX6-NEXT: s_endpgm
1803 ; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp:
1805 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1806 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1807 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1808 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1809 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1810 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1811 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1812 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1813 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1814 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1815 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1816 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
1817 ; GFX8-NEXT: v_med3_f32 v2, v2, 0, 1.0
1818 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1819 ; GFX8-NEXT: s_endpgm
1821 ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp:
1823 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1824 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1825 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1826 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1827 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1828 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
1829 ; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
1830 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1831 ; GFX9-NEXT: s_endpgm
1833 ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp:
1835 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1836 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1837 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1838 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1839 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1840 ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
1841 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1842 ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
1843 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1844 ; GFX11-NEXT: s_nop 0
1845 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1846 ; GFX11-NEXT: s_endpgm
; Per-workitem addressing: read aptr[tid], write out[tid].
1847 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1848 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1849 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1850 %a = load float, ptr addrspace(1) %gep0
; The raw loaded value feeds the clamp idiom, so it may be any bit pattern.
1851 %max = call float @llvm.maxnum.f32(float %a, float 0.0)
1852 %med = call float @llvm.minnum.f32(float %max, float 1.0)
1854 store float %med, ptr addrspace(1) %out.gep
; Same clamp-to-[0,1] idiom and attribute group (#4) as the plain
; snan/no-dx10clamp case, but the clamped value comes from an `fadd nnan`.
; NOTE(review): #4 is defined outside this chunk; going by the function name it
; presumably enables signaling-NaN handling and disables dx10_clamp -- confirm.
; The expected output on all four targets is v_add_f32 feeding straight into
; v_med3_f32 x, 0, 1.0 (GFX11 has an s_delay_alu in between); no additional
; v_mul_f32 / v_max_f32 is expected ahead of the med3.
1858 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
1859 ; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
1861 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1862 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1863 ; GFX6-NEXT: s_mov_b32 s6, 0
1864 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1865 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1866 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1867 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
1868 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1869 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
1870 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1871 ; GFX6-NEXT: v_add_f32_e32 v2, 1.0, v2
1872 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, 1.0
1873 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1874 ; GFX6-NEXT: s_endpgm
1876 ; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
1878 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1879 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1880 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1881 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1882 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1883 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1884 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1885 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1886 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1887 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1888 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1889 ; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3
1890 ; GFX8-NEXT: v_med3_f32 v2, v2, 0, 1.0
1891 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1892 ; GFX8-NEXT: s_endpgm
1894 ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
1896 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1897 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1898 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1899 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1900 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1901 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
1902 ; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
1903 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1904 ; GFX9-NEXT: s_endpgm
1906 ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
1908 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1909 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1910 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1911 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1912 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1913 ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
1914 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1915 ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
1916 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1917 ; GFX11-NEXT: s_nop 0
1918 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1919 ; GFX11-NEXT: s_endpgm
; Per-workitem addressing: read aptr[tid], write out[tid].
1920 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1921 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1922 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1923 %a = load float, ptr addrspace(1) %gep0
; The nnan flag lets the compiler assume the value being clamped is not a NaN.
1924 %add = fadd nnan float %a, 1.0
; min(max(x, 0.0), 1.0): the clamp idiom under test.
1925 %max = call float @llvm.maxnum.f32(float %add, float 0.0)
1926 %med = call float @llvm.minnum.f32(float %max, float 1.0)
1928 store float %med, ptr addrspace(1) %out.gep
; fmed3 operand-order test, "aby" ordering: fmed3(0.0, 1.0, %a), i.e. the two
; constant bounds first and the loaded value last, under attribute group #2.
; NOTE(review): #2 is defined outside this chunk; going by the function name it
; presumably disables dx10_clamp -- confirm against the attribute list.
; The expected output on all four targets is v_max_f32_e64 x, x with the clamp
; output modifier; no v_med3_f32 is expected for this operand order.
1932 define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
1933 ; GFX6-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
1935 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1936 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1937 ; GFX6-NEXT: s_mov_b32 s6, 0
1938 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1939 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
1940 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1941 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
1942 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1943 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
1944 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1945 ; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp
1946 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1947 ; GFX6-NEXT: s_endpgm
1949 ; GFX8-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
1951 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1952 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1953 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1954 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1955 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1956 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1957 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
1958 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1959 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1960 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1961 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1962 ; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
1963 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1964 ; GFX8-NEXT: s_endpgm
1966 ; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
1968 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1969 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1970 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1971 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1972 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1973 ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
1974 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1975 ; GFX9-NEXT: s_endpgm
1977 ; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
1979 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1980 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1981 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1982 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1983 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1984 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
1985 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1986 ; GFX11-NEXT: s_nop 0
1987 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1988 ; GFX11-NEXT: s_endpgm
; Per-workitem addressing: read aptr[tid], write out[tid].
1989 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1990 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
1991 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
1992 %a = load float, ptr addrspace(1) %gep0
; Operand order (0.0, 1.0, value): variable in the third slot.
1993 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
1994 store float %med, ptr addrspace(1) %out.gep
; fmed3 operand-order test, "bay" ordering: fmed3(1.0, 0.0, %a), i.e. the two
; constant bounds swapped and the loaded value last, under attribute group #2.
; NOTE(review): #2 is defined outside this chunk; going by the function name it
; presumably disables dx10_clamp -- confirm against the attribute list.
; The expected output on all four targets is v_max_f32_e64 x, x with the clamp
; output modifier; no v_med3_f32 is expected for this operand order.
1998 define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
1999 ; GFX6-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
2001 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2002 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
2003 ; GFX6-NEXT: s_mov_b32 s6, 0
2004 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2005 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2006 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2007 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
2008 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2009 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
2010 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2011 ; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp
2012 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2013 ; GFX6-NEXT: s_endpgm
2015 ; GFX8-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
2017 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2018 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2019 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2020 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2021 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2022 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2023 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2024 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2025 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2026 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2027 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2028 ; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
2029 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2030 ; GFX8-NEXT: s_endpgm
2032 ; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
2034 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2035 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2036 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2037 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2038 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2039 ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
2040 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2041 ; GFX9-NEXT: s_endpgm
2043 ; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
2045 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2047 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2048 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2049 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2050 ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
2051 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2052 ; GFX11-NEXT: s_nop 0
2053 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2054 ; GFX11-NEXT: s_endpgm
; Per-workitem addressing: read aptr[tid], write out[tid].
2055 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2056 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2057 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2058 %a = load float, ptr addrspace(1) %gep0
; Operand order (1.0, 0.0, value): variable in the third slot, bounds swapped.
2059 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
2060 store float %med, ptr addrspace(1) %out.gep
; fmed3 operand-order test, "yab" ordering: fmed3(%a, 0.0, 1.0), i.e. the
; loaded value first and the two constant bounds after it, under attribute
; group #2.
; NOTE(review): #2 is defined outside this chunk; going by the function name it
; presumably disables dx10_clamp -- confirm against the attribute list.
; The expected output on all four targets is v_med3_f32 x, 0, 1.0 with the
; operands in the same order as the IR call; no clamp modifier is expected.
2064 define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
2065 ; GFX6-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
2067 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2068 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
2069 ; GFX6-NEXT: s_mov_b32 s6, 0
2070 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2071 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2072 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2073 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
2074 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2075 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
2076 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2077 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, 1.0
2078 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2079 ; GFX6-NEXT: s_endpgm
2081 ; GFX8-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
2083 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2084 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2085 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2086 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2087 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2088 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2089 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2090 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2091 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2092 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2093 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2094 ; GFX8-NEXT: v_med3_f32 v2, v3, 0, 1.0
2095 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2096 ; GFX8-NEXT: s_endpgm
2098 ; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
2100 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2101 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2102 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2103 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2104 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2105 ; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
2106 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2107 ; GFX9-NEXT: s_endpgm
2109 ; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
2111 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2112 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2113 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2114 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2115 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2116 ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
2117 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2118 ; GFX11-NEXT: s_nop 0
2119 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2120 ; GFX11-NEXT: s_endpgm
; Per-workitem addressing: read aptr[tid], write out[tid].
2121 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2122 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2123 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2124 %a = load float, ptr addrspace(1) %gep0
; Operand order (value, 0.0, 1.0): variable in the first slot.
2125 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
2126 store float %med, ptr addrspace(1) %out.gep
; fmed3 operand-order test, "yba" ordering: fmed3(%a, 1.0, 0.0), i.e. the
; loaded value first and the two constant bounds swapped after it, under
; attribute group #2.
; NOTE(review): #2 is defined outside this chunk; going by the function name it
; presumably disables dx10_clamp -- confirm against the attribute list.
; The expected output on all four targets is v_med3_f32 x, 1.0, 0 with the
; operands in the same order as the IR call; no clamp modifier is expected.
2130 define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
2131 ; GFX6-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
2133 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2134 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
2135 ; GFX6-NEXT: s_mov_b32 s6, 0
2136 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2137 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2138 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2139 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
2140 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2141 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
2142 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2143 ; GFX6-NEXT: v_med3_f32 v2, v2, 1.0, 0
2144 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2145 ; GFX6-NEXT: s_endpgm
2147 ; GFX8-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
2149 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2150 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2151 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2152 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2153 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2154 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2155 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2156 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2157 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2158 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2159 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2160 ; GFX8-NEXT: v_med3_f32 v2, v3, 1.0, 0
2161 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2162 ; GFX8-NEXT: s_endpgm
2164 ; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
2166 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2167 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2168 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2169 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2170 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2171 ; GFX9-NEXT: v_med3_f32 v1, v1, 1.0, 0
2172 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2173 ; GFX9-NEXT: s_endpgm
2175 ; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
2177 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2178 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2179 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2180 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2181 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2182 ; GFX11-NEXT: v_med3_f32 v1, v1, 1.0, 0
2183 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2184 ; GFX11-NEXT: s_nop 0
2185 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2186 ; GFX11-NEXT: s_endpgm
; Per-workitem addressing: read aptr[tid], write out[tid].
2187 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2188 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2189 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2190 %a = load float, ptr addrspace(1) %gep0
; Operand order (value, 1.0, 0.0): variable in the first slot, bounds swapped.
2191 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
2192 store float %med, ptr addrspace(1) %out.gep
; fmed3 operand-order test, "ayb" ordering: fmed3(0.0, %a, 1.0), i.e. the
; loaded value sits between the two constant bounds, under attribute group #2.
; NOTE(review): #2 is defined outside this chunk; going by the function name it
; presumably disables dx10_clamp -- confirm against the attribute list.
; The expected output on all four targets is v_med3_f32 0, x, 1.0 with the
; operands in the same order as the IR call; no clamp modifier is expected.
2196 define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
2197 ; GFX6-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
2199 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2200 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
2201 ; GFX6-NEXT: s_mov_b32 s6, 0
2202 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2203 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2204 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2205 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
2206 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2207 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
2208 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2209 ; GFX6-NEXT: v_med3_f32 v2, 0, v2, 1.0
2210 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2211 ; GFX6-NEXT: s_endpgm
2213 ; GFX8-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
2215 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2216 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2217 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2218 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2219 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2220 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2221 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2222 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2223 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2224 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2225 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2226 ; GFX8-NEXT: v_med3_f32 v2, 0, v3, 1.0
2227 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2228 ; GFX8-NEXT: s_endpgm
2230 ; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
2232 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2233 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2234 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2235 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2236 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2237 ; GFX9-NEXT: v_med3_f32 v1, 0, v1, 1.0
2238 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2239 ; GFX9-NEXT: s_endpgm
2241 ; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
2243 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2244 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2245 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2246 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2247 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2248 ; GFX11-NEXT: v_med3_f32 v1, 0, v1, 1.0
2249 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2250 ; GFX11-NEXT: s_nop 0
2251 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2252 ; GFX11-NEXT: s_endpgm
; Per-workitem addressing: read aptr[tid], write out[tid].
2253 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2254 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2255 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2256 %a = load float, ptr addrspace(1) %gep0
; Operand order (0.0, value, 1.0): variable in the middle slot.
2257 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
2258 store float %med, ptr addrspace(1) %out.gep
; fmed3 operand-order test, "bya" ordering: fmed3(1.0, %a, 0.0), i.e. the
; loaded value sits between the two constant bounds, which are swapped, under
; attribute group #2.
; NOTE(review): #2 is defined outside this chunk; going by the function name it
; presumably disables dx10_clamp -- confirm against the attribute list.
; The expected output on all four targets is v_med3_f32 1.0, x, 0 with the
; operands in the same order as the IR call; no clamp modifier is expected.
2262 define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
2263 ; GFX6-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
2265 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2266 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
2267 ; GFX6-NEXT: s_mov_b32 s6, 0
2268 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2269 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2270 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2271 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
2272 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2273 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
2274 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2275 ; GFX6-NEXT: v_med3_f32 v2, 1.0, v2, 0
2276 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2277 ; GFX6-NEXT: s_endpgm
2279 ; GFX8-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
2281 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2282 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2283 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2284 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2285 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2286 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2287 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2288 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2289 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2290 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2291 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2292 ; GFX8-NEXT: v_med3_f32 v2, 1.0, v3, 0
2293 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2294 ; GFX8-NEXT: s_endpgm
2296 ; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
2298 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2299 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2301 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2302 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2303 ; GFX9-NEXT: v_med3_f32 v1, 1.0, v1, 0
2304 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2305 ; GFX9-NEXT: s_endpgm
2307 ; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
2309 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2310 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2311 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2312 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2313 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2314 ; GFX11-NEXT: v_med3_f32 v1, 1.0, v1, 0
2315 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2316 ; GFX11-NEXT: s_nop 0
2317 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2318 ; GFX11-NEXT: s_endpgm
; Per-workitem addressing: read aptr[tid], write out[tid].
2319 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2320 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
2321 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
2322 %a = load float, ptr addrspace(1) %gep0
; Operand order (1.0, value, 0.0): variable in the middle slot, bounds swapped.
2323 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
2324 store float %med, ptr addrspace(1) %out.gep
; Constant-folding test: fmed3(0.0, 1.0, NaN) with an all-constant operand list,
; where the third operand is the NaN literal 0x7FF8000000000000, under
; attribute group #2.
; NOTE(review): #2 is defined outside this chunk; going by the function name it
; presumably disables dx10_clamp -- confirm against the attribute list.
; The expected output contains no med3/max/clamp instruction at all: each
; target materialises the 32-bit pattern 0x7fc00000 with a move and stores it
; to out[tid], i.e. the folded result is the NaN itself.
2328 define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 {
2329 ; GFX6-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
2331 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2332 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
2333 ; GFX6-NEXT: s_mov_b32 s2, 0
2334 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2335 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2336 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x7fc00000
2337 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2338 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2339 ; GFX6-NEXT: s_endpgm
2341 ; GFX8-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
2343 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2344 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2345 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000
2346 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2347 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2348 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2349 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2350 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2351 ; GFX8-NEXT: s_endpgm
2353 ; GFX9-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
2355 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2356 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2357 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000
2358 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2359 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2360 ; GFX9-NEXT: s_endpgm
2362 ; GFX11-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
2364 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2365 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0
2366 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2367 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2368 ; GFX11-NEXT: s_nop 0
2369 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2370 ; GFX11-NEXT: s_endpgm
; Only the destination is indexed by the workitem id; there is no input load.
2371 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2372 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
; All three operands are constants, so the call can be folded at compile time.
2373 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
2374 store float %med, ptr addrspace(1) %out.gep
; Constant-folding test: fmed3(0.0, 1.0, NaN) with an all-constant operand list,
; where the third operand is the i32 2139095041 (0x7f800001) bitcast to float,
; under attribute group #2.
; NOTE(review): #2 is defined outside this chunk; going by the function name it
; presumably disables dx10_clamp -- confirm against the attribute list.
; The expected output contains no med3/max/clamp instruction at all: each
; target materialises the same 32-bit pattern 0x7f800001 with a move and stores
; it to out[tid], i.e. the NaN payload is expected to come through unchanged.
2378 define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 {
2379 ; GFX6-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
2381 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2382 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
2383 ; GFX6-NEXT: s_mov_b32 s2, 0
2384 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2385 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2386 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x7f800001
2387 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2388 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2389 ; GFX6-NEXT: s_endpgm
2391 ; GFX8-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
2393 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2394 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2395 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800001
2396 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2397 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2398 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2399 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2400 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2401 ; GFX8-NEXT: s_endpgm
2403 ; GFX9-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
2405 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2406 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2407 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800001
2408 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2409 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2410 ; GFX9-NEXT: s_endpgm
2412 ; GFX11-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
2414 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2415 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0
2416 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2417 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2418 ; GFX11-NEXT: s_nop 0
2419 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2420 ; GFX11-NEXT: s_endpgm
; Only the destination is indexed by the workitem id; there is no input load.
2421 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2422 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
; 2139095041 == 0x7f800001: all-ones exponent with only the lowest mantissa bit set.
2423 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
2424 store float %med, ptr addrspace(1) %out.gep
; Clamp both lanes of a <2 x half> to [0.0, 1.0]: load aptr[tid], apply
; maxnum(x, 0.0) followed by minnum(., 1.0), and store the result to out[tid].
; Expected lowering per the checks below: the GFX6 run converts each half to
; f32 with the clamp modifier on v_cvt_f32_f16_e64, the GFX8 run uses two
; clamped v_max_f16 instructions (an SDWA form for the high half), and the
; GFX9 and GFX11 runs use a single clamped v_pk_max_f16.
2428 define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
2429 ; GFX6-LABEL: v_clamp_v2f16:
2431 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2432 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
2433 ; GFX6-NEXT: s_mov_b32 s6, 0
2434 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2435 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2436 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2437 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
2438 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2439 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
2440 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2441 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2442 ; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
2443 ; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
2444 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
2445 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
2446 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2447 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
2448 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2449 ; GFX6-NEXT: s_endpgm
2451 ; GFX8-LABEL: v_clamp_v2f16:
2453 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2454 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2455 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2456 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2457 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2458 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2459 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2460 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2461 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2462 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2463 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2464 ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2465 ; GFX8-NEXT: v_max_f16_e64 v3, v3, v3 clamp
2466 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
2467 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2468 ; GFX8-NEXT: s_endpgm
2470 ; GFX9-LABEL: v_clamp_v2f16:
2472 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2473 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2474 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2475 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2476 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2477 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
2478 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2479 ; GFX9-NEXT: s_endpgm
2481 ; GFX11-LABEL: v_clamp_v2f16:
2483 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2484 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2485 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2486 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2487 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2488 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
2489 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2490 ; GFX11-NEXT: s_nop 0
2491 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2492 ; GFX11-NEXT: s_endpgm
; Per-workitem addressing: both pointers are indexed by the workitem id.
2493 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2494 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
2495 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
2496 %a = load <2 x half>, ptr addrspace(1) %gep0
; Lower bound 0.0 in both lanes.
2497 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> zeroinitializer)
; Upper bound 1.0 in both lanes.
2498 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
2500 store <2 x half> %med, ptr addrspace(1) %out.gep
; Variant of the <2 x half> clamp where one lane of each bound vector is undef:
; the maxnum operand is <undef, 0.0> and the minnum operand is <1.0, undef>.
; Per the checks below, the GFX9 and GFX11 runs still expect a single clamped
; v_pk_max_f16, while the GFX6 and GFX8 runs expect separate max/min/med3
; instructions without a clamp modifier. Those sequences use the constants
; 0x7fc00000 and 0x7e00 (quiet-NaN bit patterns for f32 and f16);
; NOTE(review): presumably these are the undef lanes materialized as NaN -
; confirm against the backend before relying on it.
2504 define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
2505 ; GFX6-LABEL: v_clamp_v2f16_undef_elt:
2507 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2508 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
2509 ; GFX6-NEXT: s_mov_b32 s6, 0
2510 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2511 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2512 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2513 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
2514 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2515 ; GFX6-NEXT: s_mov_b32 s2, 0x7fc00000
2516 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2517 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
2518 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2519 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
2520 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
2521 ; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
2522 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
2523 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, s2
2524 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
2525 ; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
2526 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
2527 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
2528 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2529 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
2530 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2531 ; GFX6-NEXT: s_endpgm
2533 ; GFX8-LABEL: v_clamp_v2f16_undef_elt:
2535 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2536 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2537 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
2538 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2539 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2540 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2541 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2542 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2543 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2544 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2545 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2546 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2547 ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2548 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
2549 ; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
2550 ; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
2551 ; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
2552 ; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2553 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
2554 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2555 ; GFX8-NEXT: s_endpgm
2557 ; GFX9-LABEL: v_clamp_v2f16_undef_elt:
2559 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2560 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2561 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2562 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2563 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2564 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
2565 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2566 ; GFX9-NEXT: s_endpgm
2568 ; GFX11-LABEL: v_clamp_v2f16_undef_elt:
2570 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2571 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2572 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2573 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2574 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2575 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
2576 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2577 ; GFX11-NEXT: s_nop 0
2578 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2579 ; GFX11-NEXT: s_endpgm
2580 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2581 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
2582 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
2583 %a = load <2 x half>, ptr addrspace(1) %gep0
; Lower bound: lane 0 is undef, lane 1 is 0.0.
2584 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
; Upper bound: lane 0 is 1.0, lane 1 is undef.
2585 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
2587 store <2 x half> %med, ptr addrspace(1) %out.gep
; Negative test for the <2 x half> clamp pattern: the lower bound of lane 0 is
; 2.0 instead of 0.0, so only lane 1 has the [0.0, 1.0] bounds.
; Per the checks below, the GFX6 and GFX8 runs expect the clamp modifier only
; on the instruction handling the high half, with an explicit max 2.0 / min 1.0
; pair for the low half. The GFX9 and GFX11 runs expect no clamp at all:
; v_pk_max_f16 against 2.0 followed by v_pk_min_f16 against 1.0.
2591 define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
2592 ; GFX6-LABEL: v_clamp_v2f16_not_zero:
2594 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2595 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
2596 ; GFX6-NEXT: s_mov_b32 s6, 0
2597 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2598 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2599 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2600 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
2601 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2602 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
2603 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2604 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
2605 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2606 ; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
2607 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
2608 ; GFX6-NEXT: v_max_f32_e32 v3, 2.0, v3
2609 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
2610 ; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
2611 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
2612 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2613 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
2614 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2615 ; GFX6-NEXT: s_endpgm
2617 ; GFX8-LABEL: v_clamp_v2f16_not_zero:
2619 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2620 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2621 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2622 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2623 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2624 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2625 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2626 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2627 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2628 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2629 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2630 ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
2631 ; GFX8-NEXT: v_max_f16_e32 v2, 2.0, v2
2632 ; GFX8-NEXT: v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2633 ; GFX8-NEXT: v_min_f16_e32 v2, 1.0, v2
2634 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
2635 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2636 ; GFX8-NEXT: s_endpgm
2638 ; GFX9-LABEL: v_clamp_v2f16_not_zero:
2640 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2641 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2642 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2643 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2644 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2645 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
2646 ; GFX9-NEXT: v_pk_max_f16 v1, v1, 2.0
2647 ; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
2648 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2649 ; GFX9-NEXT: s_endpgm
2651 ; GFX11-LABEL: v_clamp_v2f16_not_zero:
2653 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2654 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2655 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2656 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2657 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2658 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
2659 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2660 ; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0
2661 ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
2662 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2663 ; GFX11-NEXT: s_nop 0
2664 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2665 ; GFX11-NEXT: s_endpgm
2666 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2667 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
2668 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
2669 %a = load <2 x half>, ptr addrspace(1) %gep0
; Lane 0 uses 2.0 as the lower bound, which is what breaks the clamp pattern.
2670 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 0.0>)
2671 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
2673 store <2 x half> %med, ptr addrspace(1) %out.gep
; Negative test for the <2 x half> clamp pattern: the upper bound of lane 0 is
; 0.0 instead of 1.0, so only lane 1 has the [0.0, 1.0] bounds.
; Per the checks below, the GFX6 and GFX8 runs expect the clamp modifier only
; on the instruction handling the high half; the low half goes through
; v_med3_f32 with 0, 0 (GFX6 run) or a max 0 / min 0 pair (GFX8 run). The GFX9
; and GFX11 runs expect no clamp: v_pk_max_f16 against 0 followed by a
; v_pk_min_f16 whose op_sel modifiers select the per-lane upper bound.
2677 define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
2678 ; GFX6-LABEL: v_clamp_v2f16_not_one:
2680 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2681 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
2682 ; GFX6-NEXT: s_mov_b32 s6, 0
2683 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2684 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2685 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2686 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
2687 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2688 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
2689 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2690 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2691 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
2692 ; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
2693 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
2694 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
2695 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, 0
2696 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
2697 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2698 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
2699 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2700 ; GFX6-NEXT: s_endpgm
2702 ; GFX8-LABEL: v_clamp_v2f16_not_one:
2704 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2705 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2706 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2707 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2708 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2709 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2710 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2711 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2712 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2713 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2714 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2715 ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
2716 ; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
2717 ; GFX8-NEXT: v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2718 ; GFX8-NEXT: v_min_f16_e32 v2, 0, v2
2719 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
2720 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2721 ; GFX8-NEXT: s_endpgm
2723 ; GFX9-LABEL: v_clamp_v2f16_not_one:
2725 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2726 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2727 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2728 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2729 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2730 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
2731 ; GFX9-NEXT: v_pk_max_f16 v1, v1, 0
2732 ; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
2733 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2734 ; GFX9-NEXT: s_endpgm
2736 ; GFX11-LABEL: v_clamp_v2f16_not_one:
2738 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2739 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2740 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2741 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2742 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2743 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
2744 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2745 ; GFX11-NEXT: v_pk_max_f16 v1, v1, 0
2746 ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
2747 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2748 ; GFX11-NEXT: s_nop 0
2749 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2750 ; GFX11-NEXT: s_endpgm
2751 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2752 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
2753 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
2754 %a = load <2 x half>, ptr addrspace(1) %gep0
2755 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half 0.0>)
; Lane 0 uses 0.0 as the upper bound, which is what breaks the clamp pattern.
2756 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 0.0, half 1.0>)
2758 store <2 x half> %med, ptr addrspace(1) %out.gep
; Clamp of a negated <2 x half>: the loaded vector goes through fneg before the
; maxnum(x, 0.0) / minnum(., 1.0) pair.
; Per the checks below, the GFX6 run flips both sign bits with a
; v_xor_b32 0x80008000 and then uses clamped f16-to-f32 conversions, the GFX8
; run puts the negation on the clamped v_max_f16 operands as source modifiers,
; and the GFX9 and GFX11 runs expect one clamped v_pk_max_f16 with neg_lo and
; neg_hi set on both operands.
2762 define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
2763 ; GFX6-LABEL: v_clamp_neg_v2f16:
2765 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2766 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
2767 ; GFX6-NEXT: s_mov_b32 s6, 0
2768 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2769 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2770 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2771 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
2772 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2773 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
2774 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2775 ; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
2776 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2777 ; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
2778 ; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
2779 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
2780 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
2781 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2782 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
2783 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2784 ; GFX6-NEXT: s_endpgm
2786 ; GFX8-LABEL: v_clamp_neg_v2f16:
2788 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2789 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2790 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2791 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2792 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2793 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2794 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2795 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2796 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2797 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2798 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2799 ; GFX8-NEXT: v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2800 ; GFX8-NEXT: v_max_f16_e64 v3, -v3, -v3 clamp
2801 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
2802 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2803 ; GFX8-NEXT: s_endpgm
2805 ; GFX9-LABEL: v_clamp_neg_v2f16:
2807 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2808 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2809 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2810 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2811 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2812 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
2813 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2814 ; GFX9-NEXT: s_endpgm
2816 ; GFX11-LABEL: v_clamp_neg_v2f16:
2818 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2819 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2820 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2821 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2822 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2823 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
2824 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2825 ; GFX11-NEXT: s_nop 0
2826 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2827 ; GFX11-NEXT: s_endpgm
2828 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2829 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
2830 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
2831 %a = load <2 x half>, ptr addrspace(1) %gep0
; Negate both lanes; the clamp is applied to the negated value.
2832 %fneg.a = fneg <2 x half> %a
2833 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.a, <2 x half> zeroinitializer)
2834 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
2836 store <2 x half> %med, ptr addrspace(1) %out.gep
; Clamp of -|x| for a <2 x half>: the loaded vector goes through llvm.fabs and
; then fneg before the maxnum(x, 0.0) / minnum(., 1.0) pair.
; Per the checks below, the GFX6 run sets both sign bits with a
; v_or_b32 0x80008000 ahead of the clamped conversions, the GFX8 run uses
; -|v3| source modifiers on the clamped v_max_f16 instructions, and the GFX9
; and GFX11 runs clear the sign bits with a separate v_and_b32 0x7fff7fff and
; then apply neg_lo / neg_hi on a clamped v_pk_max_f16.
2840 define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
2841 ; GFX6-LABEL: v_clamp_negabs_v2f16:
2843 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2844 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
2845 ; GFX6-NEXT: s_mov_b32 s6, 0
2846 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2847 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2848 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2849 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
2850 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2851 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
2852 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2853 ; GFX6-NEXT: v_or_b32_e32 v2, 0x80008000, v2
2854 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2855 ; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
2856 ; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
2857 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
2858 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
2859 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2860 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
2861 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2862 ; GFX6-NEXT: s_endpgm
2864 ; GFX8-LABEL: v_clamp_negabs_v2f16:
2866 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2867 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2868 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2869 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2870 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2871 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2872 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2873 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2874 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2875 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2876 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2877 ; GFX8-NEXT: v_max_f16_sdwa v2, -|v3|, -|v3| clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2878 ; GFX8-NEXT: v_max_f16_e64 v3, -|v3|, -|v3| clamp
2879 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
2880 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2881 ; GFX8-NEXT: s_endpgm
2883 ; GFX9-LABEL: v_clamp_negabs_v2f16:
2885 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2886 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2887 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2888 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2889 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2890 ; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
2891 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
2892 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2893 ; GFX9-NEXT: s_endpgm
2895 ; GFX11-LABEL: v_clamp_negabs_v2f16:
2897 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2898 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2899 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2900 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2901 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2902 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
2903 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2904 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
2905 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2906 ; GFX11-NEXT: s_nop 0
2907 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2908 ; GFX11-NEXT: s_endpgm
2909 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2910 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
2911 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
2912 %a = load <2 x half>, ptr addrspace(1) %gep0
; Build -|a| lane-wise; the clamp is applied to that value.
2913 %fabs.a = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
2914 %fneg.fabs.a = fneg <2 x half> %fabs.a
2916 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.fabs.a, <2 x half> zeroinitializer)
2917 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
2919 store <2 x half> %med, ptr addrspace(1) %out.gep
; Clamp of a <2 x half> in which only lane 0 is negated: lane 0 is extracted,
; negated with a scalar fneg, and inserted back before the
; maxnum(x, 0.0) / minnum(., 1.0) pair.
; Per the checks below, the negation shows up only on the low-half
; instruction: "-v2" on the clamped conversion in the GFX6 run, "-v3" on the
; non-SDWA clamped v_max_f16 in the GFX8 run, and neg_lo:[1,1] (without neg_hi)
; on the clamped v_pk_max_f16 in the GFX9 and GFX11 runs.
2923 define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
2924 ; GFX6-LABEL: v_clamp_neglo_v2f16:
2926 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2927 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
2928 ; GFX6-NEXT: s_mov_b32 s6, 0
2929 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2930 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
2931 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2932 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
2933 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2934 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
2935 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2936 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2937 ; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
2938 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
2939 ; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -v2 clamp
2940 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
2941 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
2942 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2943 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
2944 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2945 ; GFX6-NEXT: s_endpgm
2947 ; GFX8-LABEL: v_clamp_neglo_v2f16:
2949 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2950 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
2951 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2952 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2953 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2954 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2955 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
2956 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2957 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
2958 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2959 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2960 ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2961 ; GFX8-NEXT: v_max_f16_e64 v3, -v3, -v3 clamp
2962 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
2963 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2964 ; GFX8-NEXT: s_endpgm
2966 ; GFX9-LABEL: v_clamp_neglo_v2f16:
2968 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2969 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2970 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2971 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2972 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2973 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
2974 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2975 ; GFX9-NEXT: s_endpgm
2977 ; GFX11-LABEL: v_clamp_neglo_v2f16:
2979 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
2980 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2981 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2982 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2983 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2984 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
2985 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2986 ; GFX11-NEXT: s_nop 0
2987 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2988 ; GFX11-NEXT: s_endpgm
2989 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2990 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
2991 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
2992 %a = load <2 x half>, ptr addrspace(1) %gep0
; Replace lane 0 with its negation; lane 1 is passed through unchanged.
2993 %lo = extractelement <2 x half> %a, i32 0
2994 %neg.lo = fneg half %lo
2995 %neg.lo.vec = insertelement <2 x half> %a, half %neg.lo, i32 0
2996 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.vec, <2 x half> zeroinitializer)
2997 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
2999 store <2 x half> %med, ptr addrspace(1) %out.gep
; Clamp of a <2 x half> in which only lane 1 is negated: lane 1 is extracted,
; negated with a scalar fneg, and inserted back before the
; maxnum(x, 0.0) / minnum(., 1.0) pair.
; Per the checks below, the negation shows up only on the high-half
; instruction: "-v3" on the clamped conversion of the shifted-down half in the
; GFX6 run, "-v3" on the clamped SDWA v_max_f16 (WORD_1 sources) in the GFX8
; run, and neg_hi:[1,1] (without neg_lo) on the clamped v_pk_max_f16 in the
; GFX9 and GFX11 runs.
3003 define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3004 ; GFX6-LABEL: v_clamp_neghi_v2f16:
3006 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
3007 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
3008 ; GFX6-NEXT: s_mov_b32 s6, 0
3009 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3010 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
3011 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
3012 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
3013 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3014 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
3015 ; GFX6-NEXT: s_waitcnt vmcnt(0)
3016 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
3017 ; GFX6-NEXT: v_cvt_f32_f16_e64 v3, -v3 clamp
3018 ; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
3019 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
3020 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
3021 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3022 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
3023 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3024 ; GFX6-NEXT: s_endpgm
3026 ; GFX8-LABEL: v_clamp_neghi_v2f16:
3028 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3029 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
3030 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3031 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
3032 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
3033 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3034 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
3035 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
3036 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
3037 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3038 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3039 ; GFX8-NEXT: v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3040 ; GFX8-NEXT: v_max_f16_e64 v3, v3, v3 clamp
3041 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
3042 ; GFX8-NEXT: flat_store_dword v[0:1], v2
3043 ; GFX8-NEXT: s_endpgm
3045 ; GFX9-LABEL: v_clamp_neghi_v2f16:
3047 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3048 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3049 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3050 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
3051 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3052 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
3053 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
3054 ; GFX9-NEXT: s_endpgm
3056 ; GFX11-LABEL: v_clamp_neghi_v2f16:
3058 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
3059 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3060 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3061 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
3062 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3063 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
3064 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
3065 ; GFX11-NEXT: s_nop 0
3066 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3067 ; GFX11-NEXT: s_endpgm
3068 %tid = call i32 @llvm.amdgcn.workitem.id.x()
3069 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3070 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3071 %a = load <2 x half>, ptr addrspace(1) %gep0
; Replace lane 1 with its negation; lane 0 is passed through unchanged.
3072 %hi = extractelement <2 x half> %a, i32 1
3073 %neg.hi = fneg half %hi
3074 %neg.hi.vec = insertelement <2 x half> %a, half %neg.hi, i32 1
3075 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.vec, <2 x half> zeroinitializer)
3076 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
3078 store <2 x half> %med, ptr addrspace(1) %out.gep
; Clamp of a lane-swapped <2 x half>: a shufflevector with mask <1, 0>
; exchanges the two lanes before the maxnum(x, 0.0) / minnum(., 1.0) pair.
; Per the checks below, the swap is folded into the clamped instructions: the
; GFX6 run recombines the clamped halves in swapped order, the GFX8 run uses
; two clamped SDWA v_max_f16 instructions with crossed source and destination
; selects, and the GFX9 and GFX11 runs expect one clamped v_pk_max_f16 with
; op_sel:[1,1] op_sel_hi:[0,0].
3082 define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3083 ; GFX6-LABEL: v_clamp_v2f16_shuffle:
3085 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
3086 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
3087 ; GFX6-NEXT: s_mov_b32 s6, 0
3088 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3089 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
3090 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
3091 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
3092 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3093 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
3094 ; GFX6-NEXT: s_waitcnt vmcnt(0)
3095 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
3096 ; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
3097 ; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
3098 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
3099 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
3100 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3101 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
3102 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3103 ; GFX6-NEXT: s_endpgm
3105 ; GFX8-LABEL: v_clamp_v2f16_shuffle:
3107 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3108 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
3109 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3110 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
3111 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
3112 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3113 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
3114 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
3115 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
3116 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3117 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3118 ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3119 ; GFX8-NEXT: v_max_f16_sdwa v3, v3, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3120 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
3121 ; GFX8-NEXT: flat_store_dword v[0:1], v2
3122 ; GFX8-NEXT: s_endpgm
3124 ; GFX9-LABEL: v_clamp_v2f16_shuffle:
3126 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3127 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3128 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3129 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
3130 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3131 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
3132 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
3133 ; GFX9-NEXT: s_endpgm
3135 ; GFX11-LABEL: v_clamp_v2f16_shuffle:
3137 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
3138 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3139 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3140 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
3141 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3142 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
3143 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
3144 ; GFX11-NEXT: s_nop 0
3145 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3146 ; GFX11-NEXT: s_endpgm
3147 %tid = call i32 @llvm.amdgcn.workitem.id.x()
3148 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3149 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3150 %a = load <2 x half>, ptr addrspace(1) %gep0
; Mask <1, 0> swaps the two lanes of %a; the second shuffle operand is unused.
3151 %shuf = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> <i32 1, i32 0>
3152 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer)
3153 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
3155 store <2 x half> %med, ptr addrspace(1) %out.gep
; Clamp of a <2 x half> value where each limit vector has one undef element:
; the maxnum lower bound is <0.0, undef> and the minnum upper bound is
; <undef, 1.0>.
; Per the assertions below, the GFX9 and GFX11 outputs are a single
; v_pk_max_f16 with the clamp modifier. The GFX6 and GFX8 outputs keep
; explicit max/min (or v_med3_f32) instructions, in which the undef limits
; appear to be materialized as quiet-NaN constants (0x7fc00000 for f32,
; 0x7e00 for f16).
3159 define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3160 ; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts0:
3162 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
3163 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
3164 ; GFX6-NEXT: s_mov_b32 s6, 0
3165 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3166 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
3167 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
3168 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
3169 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3170 ; GFX6-NEXT: s_mov_b32 s2, 0x7fc00000
3171 ; GFX6-NEXT: s_waitcnt vmcnt(0)
3172 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
3173 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
3174 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
3175 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
3176 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
3177 ; GFX6-NEXT: v_med3_f32 v3, v3, s2, 1.0
3178 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
3179 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, s2
3180 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
3181 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
3182 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
3183 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
3184 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3185 ; GFX6-NEXT: s_endpgm
3187 ; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts0:
3189 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3190 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
3191 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
3192 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3193 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
3194 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
3195 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3196 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
3197 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
3198 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
3199 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3200 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3201 ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3202 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
3203 ; GFX8-NEXT: v_max_f16_e32 v2, 0x7e00, v2
3204 ; GFX8-NEXT: v_max_f16_e32 v3, 0, v3
3205 ; GFX8-NEXT: v_min_f16_e32 v3, 0x7e00, v3
3206 ; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3207 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
3208 ; GFX8-NEXT: flat_store_dword v[0:1], v2
3209 ; GFX8-NEXT: s_endpgm
3211 ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0:
3213 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3214 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3215 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3216 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
3217 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3218 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
3219 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
3220 ; GFX9-NEXT: s_endpgm
3222 ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0:
3224 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
3225 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3226 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3227 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
3228 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3229 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
3230 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
3231 ; GFX11-NEXT: s_nop 0
3232 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3233 ; GFX11-NEXT: s_endpgm
; The workitem id indexes both the input (%aptr) and the output (%out) as
; arrays of <2 x half>.
3234 %tid = call i32 @llvm.amdgcn.workitem.id.x()
3235 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3236 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3237 %a = load <2 x half>, ptr addrspace(1) %gep0
; Lower bound: element 0 is 0.0, element 1 is undef.
3238 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>)
; Upper bound: element 0 is undef, element 1 is 1.0.
3239 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>)
3241 store <2 x half> %med, ptr addrspace(1) %out.gep
; Same shape as a <2 x half> clamp, but with the undef limit elements in the
; opposite positions: the maxnum lower bound is <undef, 0.0> and the minnum
; upper bound is <1.0, undef>.
; Per the assertions below, the GFX9 and GFX11 outputs are a single
; v_pk_max_f16 with the clamp modifier. The GFX6 and GFX8 outputs keep
; explicit max/min (or v_med3_f32) instructions, in which the undef limits
; appear to be materialized as quiet-NaN constants (0x7fc00000 for f32,
; 0x7e00 for f16).
3245 define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
3246 ; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts1:
3248 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
3249 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
3250 ; GFX6-NEXT: s_mov_b32 s6, 0
3251 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3252 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
3253 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
3254 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
3255 ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3256 ; GFX6-NEXT: s_mov_b32 s2, 0x7fc00000
3257 ; GFX6-NEXT: s_waitcnt vmcnt(0)
3258 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
3259 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
3260 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
3261 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
3262 ; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
3263 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
3264 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, s2
3265 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
3266 ; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
3267 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
3268 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
3269 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3270 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
3271 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3272 ; GFX6-NEXT: s_endpgm
3274 ; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts1:
3276 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3277 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
3278 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
3279 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3280 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
3281 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
3282 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3283 ; GFX8-NEXT: flat_load_dword v3, v[0:1]
3284 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
3285 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
3286 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3287 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3288 ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3289 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
3290 ; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
3291 ; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
3292 ; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
3293 ; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3294 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
3295 ; GFX8-NEXT: flat_store_dword v[0:1], v2
3296 ; GFX8-NEXT: s_endpgm
3298 ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1:
3300 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3301 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3303 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
3304 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3305 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
3306 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
3307 ; GFX9-NEXT: s_endpgm
3309 ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1:
3311 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
3312 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3313 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3314 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
3315 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3316 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
3317 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
3318 ; GFX11-NEXT: s_nop 0
3319 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3320 ; GFX11-NEXT: s_endpgm
; The workitem id indexes both the input (%aptr) and the output (%out) as
; arrays of <2 x half>.
3321 %tid = call i32 @llvm.amdgcn.workitem.id.x()
3322 %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
3323 %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
3324 %a = load <2 x half>, ptr addrspace(1) %gep0
; Lower bound: element 0 is undef, element 1 is 0.0.
3325 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
; Upper bound: element 0 is 1.0, element 1 is undef.
3326 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
3328 store <2 x half> %med, ptr addrspace(1) %out.gep
; Clamp to [0.0, 1.0] applied to maxnum(%a, %b), where %a and %b are two
; different nsz fadd results that share the operand %l0. Unlike the kernels
; above, no workitem id is used: the loads and the store use constant
; element indices.
; Per the assertions below, all four targets emit one v_max_f32_e64 that
; takes both sums as sources and carries the clamp modifier; no separate
; clamp instruction is expected.
3332 define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0
3333 ; GFX6-LABEL: v_clamp_diff_source_f32:
3335 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
3336 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
3337 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
3338 ; GFX6-NEXT: s_load_dword s2, s[2:3], 0x2
3339 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
3340 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
3341 ; GFX6-NEXT: v_mov_b32_e32 v0, s5
3342 ; GFX6-NEXT: v_mov_b32_e32 v1, s2
3343 ; GFX6-NEXT: v_add_f32_e32 v0, s4, v0
3344 ; GFX6-NEXT: v_add_f32_e32 v1, s4, v1
3345 ; GFX6-NEXT: v_max_f32_e64 v0, v0, v1 clamp
3346 ; GFX6-NEXT: s_mov_b32 s2, -1
3347 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
3348 ; GFX6-NEXT: s_endpgm
3350 ; GFX8-LABEL: v_clamp_diff_source_f32:
3352 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3353 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3354 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
3355 ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x8
3356 ; GFX8-NEXT: s_add_u32 s0, s0, 12
3357 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
3358 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3359 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
3360 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
3361 ; GFX8-NEXT: v_add_f32_e32 v0, s4, v0
3362 ; GFX8-NEXT: v_add_f32_e32 v1, s4, v1
3363 ; GFX8-NEXT: v_max_f32_e64 v2, v0, v1 clamp
3364 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
3365 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
3366 ; GFX8-NEXT: flat_store_dword v[0:1], v2
3367 ; GFX8-NEXT: s_endpgm
3369 ; GFX9-LABEL: v_clamp_diff_source_f32:
3371 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3372 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3373 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3374 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
3375 ; GFX9-NEXT: s_load_dword s6, s[2:3], 0x8
3376 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3377 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3378 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
3379 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v1
3380 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
3381 ; GFX9-NEXT: v_max_f32_e64 v1, v1, v2 clamp
3382 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:12
3383 ; GFX9-NEXT: s_endpgm
3385 ; GFX11-LABEL: v_clamp_diff_source_f32:
3387 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
3388 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
3389 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3390 ; GFX11-NEXT: s_clause 0x1
3391 ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
3392 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x8
3393 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3394 ; GFX11-NEXT: v_add_f32_e64 v0, s4, s5
3395 ; GFX11-NEXT: v_add_f32_e64 v1, s4, s2
3396 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3397 ; GFX11-NEXT: v_max_f32_e64 v0, v0, v1 clamp
3398 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] offset:12
3399 ; GFX11-NEXT: s_nop 0
3400 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3401 ; GFX11-NEXT: s_endpgm
; Three floats are read from %aptr at element indices 0, 1 and 2.
3403 %gep1 = getelementptr float, ptr addrspace(1) %aptr, i32 1
3404 %gep2 = getelementptr float, ptr addrspace(1) %aptr, i32 2
3405 %l0 = load float, ptr addrspace(1) %aptr
3406 %l1 = load float, ptr addrspace(1) %gep1
3407 %l2 = load float, ptr addrspace(1) %gep2
; Two distinct sums that both use %l0 feed the maxnum being clamped.
3408 %a = fadd nsz float %l0, %l1
3409 %b = fadd nsz float %l0, %l2
3410 %res = call nsz float @llvm.maxnum.f32(float %a, float %b)
; Clamp pattern: max with 0.0 followed by min with 1.0.
3411 %max = call nsz float @llvm.maxnum.f32(float %res, float 0.0)
3412 %min = call nsz float @llvm.minnum.f32(float %max, float 1.0)
; The result goes to element 3 of %out, matching the 12-byte store offset in
; the assertions above.
3413 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 3
3414 store float %min, ptr addrspace(1) %out.gep
; Intrinsic declarations shared by the kernels in this file; all carry
; attribute group #1. The kernels visible near the end of the file call
; workitem.id.x, maxnum/minnum.f32 and maxnum/minnum.v2f16; the remaining
; declarations are presumably used by kernels earlier in the file.
3418 declare i32 @llvm.amdgcn.workitem.id.x() #1
; f32 intrinsics.
3419 declare float @llvm.fabs.f32(float) #1
3420 declare float @llvm.minnum.f32(float, float) #1
3421 declare float @llvm.maxnum.f32(float, float) #1
3422 declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
; f64 intrinsics.
3423 declare double @llvm.fabs.f64(double) #1
3424 declare double @llvm.minnum.f64(double, double) #1
3425 declare double @llvm.maxnum.f64(double, double) #1
; f16 intrinsics.
3426 declare half @llvm.fabs.f16(half) #1
3427 declare half @llvm.minnum.f16(half, half) #1
3428 declare half @llvm.maxnum.f16(half, half) #1
; <2 x half> intrinsics.
3429 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
3430 declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
3431 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
; Attribute groups.
; #0 is the group on the kernel definitions visible near the end of the file:
; nounwind plus "preserve-sign" f32 denormal handling.
3433 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; #1 is the group on the intrinsic declarations.
3434 attributes #1 = { nounwind readnone }
; #2, #3 and #4 additionally set "amdgpu-dx10-clamp" and "no-nans-fp-math"
; explicitly; only #3 sets "amdgpu-dx10-clamp" to "true".
; NOTE(review): #2 and #4 are textually identical as written here. The
; functions using them are not visible from this part of the file, so confirm
; whether the duplication is intentional before changing either group.
3435 attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
3436 attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
3437 attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }