1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
2 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
; Basic f32 clamp: a loaded float is clamped with
; minnum(maxnum(%a, 0.0), 1.0). The checks expect a single v_max_f32_e64 of
; the loaded register with itself, carrying the clamp modifier.
5 ; GCN-LABEL: {{^}}v_clamp_f32:
6 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
7 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
8 define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
9 %tid = call i32 @llvm.amdgcn.workitem.id.x()
10 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
11 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
12 %a = load float, float addrspace(1)* %gep0
13 %max = call float @llvm.maxnum.f32(float %a, float 0.0)
14 %med = call float @llvm.minnum.f32(float %max, float 1.0)
16 store float %med, float addrspace(1)* %out.gep
; f32 clamp of a negated input (fsub -0.0, %a). The checks expect the negation
; to appear as a neg source modifier on both operands of the clamping
; v_max_f32_e64.
20 ; GCN-LABEL: {{^}}v_clamp_neg_f32:
21 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
22 ; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
23 define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
24 %tid = call i32 @llvm.amdgcn.workitem.id.x()
25 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
26 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
27 %a = load float, float addrspace(1)* %gep0
28 %fneg.a = fsub float -0.0, %a
29 %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
30 %med = call float @llvm.minnum.f32(float %max, float 1.0)
32 store float %med, float addrspace(1)* %out.gep
; f32 clamp of -fabs(%a). The checks expect both the abs and neg source
; modifiers on each operand of the clamping v_max_f32_e64.
36 ; GCN-LABEL: {{^}}v_clamp_negabs_f32:
37 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
38 ; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
39 define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
40 %tid = call i32 @llvm.amdgcn.workitem.id.x()
41 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
42 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
43 %a = load float, float addrspace(1)* %gep0
44 %fabs.a = call float @llvm.fabs.f32(float %a)
45 %fneg.fabs.a = fsub float -0.0, %fabs.a
47 %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
48 %med = call float @llvm.minnum.f32(float %max, float 1.0)
50 store float %med, float addrspace(1)* %out.gep
; Same min/max shape as the basic clamp, but the lower bound is -0.0 rather
; than 0.0. The checks expect no clamp modifier here: a v_bfrev_b32 of 1
; materializes the bound into a register and a v_med3_f32 consumes it together
; with the loaded value and 1.0.
54 ; GCN-LABEL: {{^}}v_clamp_negzero_f32:
55 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
56 ; GCN-DAG: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1
57 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[SIGNBIT]], 1.0
58 define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
59 %tid = call i32 @llvm.amdgcn.workitem.id.x()
60 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
61 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
62 %a = load float, float addrspace(1)* %gep0
63 %max = call float @llvm.maxnum.f32(float %a, float -0.0)
64 %med = call float @llvm.minnum.f32(float %max, float 1.0)
66 store float %med, float addrspace(1)* %out.gep
; The intermediate maxnum result has a second use (a volatile store), so it
; must stay available. The checks expect separate v_max_f32 and v_min_f32
; instructions rather than a folded clamp.
70 ; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
71 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
72 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
73 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
74 define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
75 %tid = call i32 @llvm.amdgcn.workitem.id.x()
76 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
77 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
78 %a = load float, float addrspace(1)* %gep0
79 %max = call float @llvm.maxnum.f32(float %a, float 0.0)
80 %med = call float @llvm.minnum.f32(float %max, float 1.0)
82 store float %med, float addrspace(1)* %out.gep
83 store volatile float %max, float addrspace(1)* undef
; f16 version of the basic clamp. The GFX89 check expects a clamping
; v_max_f16_e64; the SI checks expect the clamp modifier on the f16-to-f32
; conversion, followed by a conversion back to f16.
87 ; GCN-LABEL: {{^}}v_clamp_f16:
88 ; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
89 ; GFX89: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
91 ; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
92 ; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
93 define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
94 %tid = call i32 @llvm.amdgcn.workitem.id.x()
95 %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
96 %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
97 %a = load half, half addrspace(1)* %gep0
98 %max = call half @llvm.maxnum.f16(half %a, half 0.0)
99 %med = call half @llvm.minnum.f16(half %max, half 1.0)
101 store half %med, half addrspace(1)* %out.gep
; f16 clamp of a negated input. The GFX89 check expects neg modifiers on the
; clamping v_max_f16_e64; the SI checks expect the neg modifier and the clamp
; on the f16-to-f32 conversion.
105 ; GCN-LABEL: {{^}}v_clamp_neg_f16:
106 ; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
107 ; GFX89: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
109 ; FIXME: Better to fold neg into max
110 ; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
111 ; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
112 define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
113 %tid = call i32 @llvm.amdgcn.workitem.id.x()
114 %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
115 %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
116 %a = load half, half addrspace(1)* %gep0
117 %fneg.a = fsub half -0.0, %a
118 %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
119 %med = call half @llvm.minnum.f16(half %max, half 1.0)
121 store half %med, half addrspace(1)* %out.gep
; f16 clamp of -fabs(%a). The GFX89 check expects neg and abs modifiers on the
; clamping v_max_f16_e64; the SI checks expect them, with the clamp, on the
; f16-to-f32 conversion.
125 ; GCN-LABEL: {{^}}v_clamp_negabs_f16:
126 ; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
127 ; GFX89: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
129 ; FIXME: Better to fold neg/abs into max
131 ; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}}
132 ; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
133 define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
134 %tid = call i32 @llvm.amdgcn.workitem.id.x()
135 %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
136 %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
137 %a = load half, half addrspace(1)* %gep0
138 %fabs.a = call half @llvm.fabs.f16(half %a)
139 %fneg.fabs.a = fsub half -0.0, %fabs.a
141 %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
142 %med = call half @llvm.minnum.f16(half %max, half 1.0)
144 store half %med, half addrspace(1)* %out.gep
; f64 version of the basic clamp. The checks expect a v_max_f64 of the loaded
; 64-bit register pair with itself, carrying the clamp modifier.
148 ; FIXME: Do f64 instructions support clamp?
149 ; GCN-LABEL: {{^}}v_clamp_f64:
150 ; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
151 ; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
152 define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
153 %tid = call i32 @llvm.amdgcn.workitem.id.x()
154 %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
155 %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
156 %a = load double, double addrspace(1)* %gep0
157 %max = call double @llvm.maxnum.f64(double %a, double 0.0)
158 %med = call double @llvm.minnum.f64(double %max, double 1.0)
160 store double %med, double addrspace(1)* %out.gep
; f64 clamp of a negated input. The checks expect neg source modifiers on both
; operands of the clamping v_max_f64.
164 ; GCN-LABEL: {{^}}v_clamp_neg_f64:
165 ; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
166 ; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
167 define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
168 %tid = call i32 @llvm.amdgcn.workitem.id.x()
169 %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
170 %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
171 %a = load double, double addrspace(1)* %gep0
172 %fneg.a = fsub double -0.0, %a
173 %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
174 %med = call double @llvm.minnum.f64(double %max, double 1.0)
176 store double %med, double addrspace(1)* %out.gep
; f64 clamp of -fabs(%a). The checks expect neg and abs source modifiers on
; both operands of the clamping v_max_f64.
180 ; GCN-LABEL: {{^}}v_clamp_negabs_f64:
181 ; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
182 ; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}}
183 define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
184 %tid = call i32 @llvm.amdgcn.workitem.id.x()
185 %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
186 %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
187 %a = load double, double addrspace(1)* %gep0
188 %fabs.a = call double @llvm.fabs.f64(double %a)
189 %fneg.fabs.a = fsub double -0.0, %fabs.a
191 %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
192 %med = call double @llvm.minnum.f64(double %max, double 1.0)
194 store double %med, double addrspace(1)* %out.gep
; fmed3 intrinsic called with the constants -0.0 and 1.0 and the loaded value
; as its third operand. The check lines visible here only match the label and
; the load.
198 ; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32:
199 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
201 define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
202 %tid = call i32 @llvm.amdgcn.workitem.id.x()
203 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
204 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
205 %a = load float, float addrspace(1)* %gep0
206 %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
207 store float %med, float addrspace(1)* %out.gep
; fmed3 with operand order (0.0, 1.0, %a). The checks expect it to become a
; v_max_f32_e64 of the loaded value with the clamp modifier.
211 ; GCN-LABEL: {{^}}v_clamp_med3_aby_f32:
212 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
213 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
214 define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
215 %tid = call i32 @llvm.amdgcn.workitem.id.x()
216 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
217 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
218 %a = load float, float addrspace(1)* %gep0
219 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
220 store float %med, float addrspace(1)* %out.gep
; fmed3 with operand order (1.0, 0.0, %a). The checks expect it to become a
; v_max_f32_e64 of the loaded value with the clamp modifier.
224 ; GCN-LABEL: {{^}}v_clamp_med3_bay_f32:
225 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
226 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
227 define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
228 %tid = call i32 @llvm.amdgcn.workitem.id.x()
229 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
230 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
231 %a = load float, float addrspace(1)* %gep0
232 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
233 store float %med, float addrspace(1)* %out.gep
; fmed3 with operand order (%a, 0.0, 1.0). The checks expect it to become a
; v_max_f32_e64 of the loaded value with the clamp modifier.
237 ; GCN-LABEL: {{^}}v_clamp_med3_yab_f32:
238 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
239 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
240 define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
241 %tid = call i32 @llvm.amdgcn.workitem.id.x()
242 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
243 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
244 %a = load float, float addrspace(1)* %gep0
245 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
246 store float %med, float addrspace(1)* %out.gep
; fmed3 with operand order (%a, 1.0, 0.0). The checks expect it to become a
; v_max_f32_e64 of the loaded value with the clamp modifier.
250 ; GCN-LABEL: {{^}}v_clamp_med3_yba_f32:
251 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
252 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
253 define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
254 %tid = call i32 @llvm.amdgcn.workitem.id.x()
255 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
256 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
257 %a = load float, float addrspace(1)* %gep0
258 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
259 store float %med, float addrspace(1)* %out.gep
; fmed3 with operand order (0.0, %a, 1.0). The checks expect it to become a
; v_max_f32_e64 of the loaded value with the clamp modifier.
263 ; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32:
264 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
265 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
266 define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
267 %tid = call i32 @llvm.amdgcn.workitem.id.x()
268 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
269 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
270 %a = load float, float addrspace(1)* %gep0
271 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
272 store float %med, float addrspace(1)* %out.gep
; fmed3 with operand order (1.0, %a, 0.0). The checks expect it to become a
; v_max_f32_e64 of the loaded value with the clamp modifier.
276 ; GCN-LABEL: {{^}}v_clamp_med3_bya_f32:
277 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
278 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
279 define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
280 %tid = call i32 @llvm.amdgcn.workitem.id.x()
281 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
282 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
283 %a = load float, float addrspace(1)* %gep0
284 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
285 store float %med, float addrspace(1)* %out.gep
; All-constant fmed3(0.0, 1.0, 4.0). The check expects the result to be
; materialized directly as the constant 1.0.
289 ; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32:
290 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0
291 define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
292 %tid = call i32 @llvm.amdgcn.workitem.id.x()
293 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
294 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
295 store float %med, float addrspace(1)* %out.gep
; All-constant fmed3(0.0, 1.0, -4.0). The check expects the result to be
; materialized directly as the constant 0.
299 ; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32:
300 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
301 define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
302 %tid = call i32 @llvm.amdgcn.workitem.id.x()
303 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
304 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
305 store float %med, float addrspace(1)* %out.gep
; All-constant fmed3(0.0, 1.0, 0.5). The third operand already lies between
; the bounds, and the check expects the constant 0.5 to be materialized.
309 ; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32:
310 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5
311 define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
312 %tid = call i32 @llvm.amdgcn.workitem.id.x()
313 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
314 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
315 store float %med, float addrspace(1)* %out.gep
; All-constant fmed3 whose third operand is the bit pattern 8388607 (0x7fffff)
; reinterpreted as a float. The check expects that same bit pattern to be
; materialized unchanged.
319 ; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32:
320 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}}
321 define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
322 %tid = call i32 @llvm.amdgcn.workitem.id.x()
323 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
324 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
325 store float %med, float addrspace(1)* %out.gep
; All-constant fmed3 whose third operand is the NaN constant
; 0x7FF8000000000000. Under attributes #0 the check expects the result to be
; the constant 0.
329 ; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32:
330 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
331 define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
332 %tid = call i32 @llvm.amdgcn.workitem.id.x()
333 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
334 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
335 store float %med, float addrspace(1)* %out.gep
; All-constant fmed3 whose third operand is the bit pattern 2139095041
; (0x7f800001) reinterpreted as a float. Under attributes #0 the check expects
; the result to be the constant 0.
339 ; GCN-LABEL: {{^}}v_clamp_constant_snan_f32:
340 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
341 define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
342 %tid = call i32 @llvm.amdgcn.workitem.id.x()
343 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
344 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
345 store float %med, float addrspace(1)* %out.gep
349 ; ---------------------------------------------------------------------
350 ; Test non-default behaviors enabling snans and disabling dx10_clamp
351 ; ---------------------------------------------------------------------
; Basic min/max clamp pattern compiled with attributes #2 (dx10-clamp and
; fp-exceptions both disabled). The checks expect a v_med3_f32 with the
; constants 0 and 1.0 instead of a clamp modifier.
353 ; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
354 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
355 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
356 define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
357 %tid = call i32 @llvm.amdgcn.workitem.id.x()
358 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
359 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
360 %a = load float, float addrspace(1)* %gep0
361 %max = call float @llvm.maxnum.f32(float %a, float 0.0)
362 %med = call float @llvm.minnum.f32(float %max, float 1.0)
364 store float %med, float addrspace(1)* %out.gep
; Basic min/max clamp pattern compiled with attributes #3 (dx10-clamp and
; fp-exceptions both enabled). The checks still expect a single clamping
; v_max_f32_e64.
368 ; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp:
369 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
370 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
371 define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
372 %tid = call i32 @llvm.amdgcn.workitem.id.x()
373 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
374 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
375 %a = load float, float addrspace(1)* %gep0
376 %max = call float @llvm.maxnum.f32(float %a, float 0.0)
377 %med = call float @llvm.minnum.f32(float %max, float 1.0)
379 store float %med, float addrspace(1)* %out.gep
; Basic min/max clamp pattern compiled with attributes #4 (dx10-clamp
; disabled, fp-exceptions enabled). The checks expect neither a clamp modifier
; nor a med3: separate v_max_f32 and v_min_f32 instructions are emitted.
383 ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
384 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
385 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
386 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
387 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
388 %tid = call i32 @llvm.amdgcn.workitem.id.x()
389 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
390 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
391 %a = load float, float addrspace(1)* %gep0
392 %max = call float @llvm.maxnum.f32(float %a, float 0.0)
393 %med = call float @llvm.minnum.f32(float %max, float 1.0)
395 store float %med, float addrspace(1)* %out.gep
; Min/max clamp pattern under attributes #4 (dx10-clamp disabled,
; fp-exceptions enabled) where the clamped value is the result of an
; `fadd nnan`. The checks expect a single v_med3_f32 fed by the add result.
; The add is captured explicitly so the med3 operand is tied to the value that
; is actually clamped, not to the register that held the raw load.
399 ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
400 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
401 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
402 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
403 %tid = call i32 @llvm.amdgcn.workitem.id.x()
404 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
405 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
406 %a = load float, float addrspace(1)* %gep0
407 %add = fadd nnan float %a, 1.0
408 %max = call float @llvm.maxnum.f32(float %add, float 0.0)
409 %med = call float @llvm.minnum.f32(float %max, float 1.0)
411 store float %med, float addrspace(1)* %out.gep
; fmed3(0.0, 1.0, %a) compiled with attributes #2 (dx10-clamp disabled). The
; checks still expect a clamping v_max_f32_e64 for this operand order.
415 ; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp:
416 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
417 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
418 define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
419 %tid = call i32 @llvm.amdgcn.workitem.id.x()
420 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
421 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
422 %a = load float, float addrspace(1)* %gep0
423 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
424 store float %med, float addrspace(1)* %out.gep
; fmed3(1.0, 0.0, %a) compiled with attributes #2 (dx10-clamp disabled). The
; checks still expect a clamping v_max_f32_e64 for this operand order.
428 ; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp:
429 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
430 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
431 define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
432 %tid = call i32 @llvm.amdgcn.workitem.id.x()
433 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
434 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
435 %a = load float, float addrspace(1)* %gep0
436 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
437 store float %med, float addrspace(1)* %out.gep
; fmed3(%a, 0.0, 1.0) compiled with attributes #2 (dx10-clamp disabled). The
; checks expect the v_med3_f32 to be kept, with operands in the order
; (loaded value, 0, 1.0).
441 ; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp:
442 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
443 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
444 define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
445 %tid = call i32 @llvm.amdgcn.workitem.id.x()
446 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
447 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
448 %a = load float, float addrspace(1)* %gep0
449 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
450 store float %med, float addrspace(1)* %out.gep
; fmed3(%a, 1.0, 0.0) compiled with attributes #2 (dx10-clamp disabled). The
; checks expect the v_med3_f32 to be kept, with operands in the order
; (loaded value, 1.0, 0).
454 ; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp:
455 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
456 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0
457 define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
458 %tid = call i32 @llvm.amdgcn.workitem.id.x()
459 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
460 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
461 %a = load float, float addrspace(1)* %gep0
462 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
463 store float %med, float addrspace(1)* %out.gep
; fmed3(0.0, %a, 1.0) compiled with attributes #2 (dx10-clamp disabled). The
; checks expect the v_med3_f32 to be kept, with operands in the order
; (0, loaded value, 1.0).
467 ; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp:
468 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
469 ; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0
470 define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
471 %tid = call i32 @llvm.amdgcn.workitem.id.x()
472 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
473 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
474 %a = load float, float addrspace(1)* %gep0
475 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
476 store float %med, float addrspace(1)* %out.gep
; fmed3(1.0, %a, 0.0) compiled with attributes #2 (dx10-clamp disabled). The
; checks expect the v_med3_f32 to be kept, with operands in the order
; (1.0, loaded value, 0).
480 ; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp:
481 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
482 ; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0
483 define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
484 %tid = call i32 @llvm.amdgcn.workitem.id.x()
485 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
486 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
487 %a = load float, float addrspace(1)* %gep0
488 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
489 store float %med, float addrspace(1)* %out.gep
; All-constant fmed3 with the NaN constant 0x7FF8000000000000 as third
; operand, compiled with attributes #2 (dx10-clamp disabled). The check
; expects the NaN bit pattern 0x7fc00000 to be materialized rather than 0.
493 ; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp:
494 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
495 define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
496 %tid = call i32 @llvm.amdgcn.workitem.id.x()
497 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
498 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
499 store float %med, float addrspace(1)* %out.gep
; All-constant fmed3 with the bit pattern 2139095041 (0x7f800001) as third
; operand, compiled with attributes #2 (dx10-clamp disabled). The check
; expects that same bit pattern to be materialized rather than 0.
503 ; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp:
504 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001
505 define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
506 %tid = call i32 @llvm.amdgcn.workitem.id.x()
507 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
508 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
509 store float %med, float addrspace(1)* %out.gep
; Packed <2 x half> clamp: minnum(maxnum(%a, 0), <1.0, 1.0>). The GFX9 check
; expects a single v_pk_max_f16 of the loaded register with itself, carrying
; the clamp modifier.
513 ; GCN-LABEL: {{^}}v_clamp_v2f16:
514 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
516 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
517 define amdgpu_kernel void @v_clamp_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
518 %tid = call i32 @llvm.amdgcn.workitem.id.x()
519 %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
520 %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
521 %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
522 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> zeroinitializer)
523 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
525 store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
; Packed clamp where each bound vector has one undef element
; (<undef, 0.0> for the max and <1.0, undef> for the min). The GFX9 check
; still expects a clamping v_pk_max_f16.
529 ; GCN-LABEL: {{^}}v_clamp_v2f16_undef_elt:
530 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
532 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
533 define amdgpu_kernel void @v_clamp_v2f16_undef_elt(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
534 %tid = call i32 @llvm.amdgcn.workitem.id.x()
535 %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
536 %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
537 %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
538 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
539 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
541 store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
; Packed min/max where the lower-bound vector is <2.0, 0.0>, i.e. one lane's
; bound is not zero. Only the label check is visible in this listing.
545 ; GCN-LABEL: {{^}}v_clamp_v2f16_not_zero:
548 define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
549 %tid = call i32 @llvm.amdgcn.workitem.id.x()
550 %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
551 %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
552 %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
553 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 0.0>)
554 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
556 store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
; Packed min/max where the upper-bound vector is <0.0, 1.0>, i.e. one lane's
; bound is not one. Only the label check is visible in this listing.
560 ; GCN-LABEL: {{^}}v_clamp_v2f16_not_one:
563 define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
564 %tid = call i32 @llvm.amdgcn.workitem.id.x()
565 %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
566 %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
567 %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
568 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half 0.0>)
569 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 0.0, half 1.0>)
571 store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
; Packed clamp of a fully negated vector. The GFX9 check expects the negation
; to be expressed through neg_lo and neg_hi on both operands of the clamping
; v_pk_max_f16.
575 ; GCN-LABEL: {{^}}v_clamp_neg_v2f16:
576 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
578 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
579 define amdgpu_kernel void @v_clamp_neg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
580 %tid = call i32 @llvm.amdgcn.workitem.id.x()
581 %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
582 %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
583 %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
584 %fneg.a = fsub <2 x half> <half -0.0, half -0.0>, %a
585 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.a, <2 x half> zeroinitializer)
586 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
588 store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
; Packed clamp of -fabs(%a). The GFX9 checks expect the fabs as a separate
; v_and_b32 with mask 0x7fff7fff, and the negation as neg_lo/neg_hi on the
; clamping v_pk_max_f16.
592 ; GCN-LABEL: {{^}}v_clamp_negabs_v2f16:
593 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
594 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, [[A]]
595 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
596 define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
597 %tid = call i32 @llvm.amdgcn.workitem.id.x()
598 %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
599 %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
600 %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
601 %fabs.a = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
602 %fneg.fabs.a = fsub <2 x half> <half -0.0, half -0.0>, %fabs.a
604 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.fabs.a, <2 x half> zeroinitializer)
605 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
607 store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
; Packed clamp where only element 0 of the vector is negated before clamping.
; The GFX9 check expects only neg_lo to be set on the clamping v_pk_max_f16.
611 ; GCN-LABEL: {{^}}v_clamp_neglo_v2f16:
612 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
614 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] clamp{{$}}
615 define amdgpu_kernel void @v_clamp_neglo_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
616 %tid = call i32 @llvm.amdgcn.workitem.id.x()
617 %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
618 %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
619 %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
620 %lo = extractelement <2 x half> %a, i32 0
621 %neg.lo = fsub half -0.0, %lo
622 %neg.lo.vec = insertelement <2 x half> %a, half %neg.lo, i32 0
623 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.vec, <2 x half> zeroinitializer)
624 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
626 store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
; Packed clamp where only element 1 of the vector is negated before clamping.
; The GFX9 check expects only neg_hi to be set on the clamping v_pk_max_f16.
630 ; GCN-LABEL: {{^}}v_clamp_neghi_v2f16:
631 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
633 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_hi:[1,1] clamp{{$}}
634 define amdgpu_kernel void @v_clamp_neghi_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
635 %tid = call i32 @llvm.amdgcn.workitem.id.x()
636 %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
637 %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
638 %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
639 %hi = extractelement <2 x half> %a, i32 1
640 %neg.hi = fsub half -0.0, %hi
641 %neg.hi.vec = insertelement <2 x half> %a, half %neg.hi, i32 1
642 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.vec, <2 x half> zeroinitializer)
643 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
645 store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
; Packed clamp of a vector whose two elements were swapped by a shufflevector
; with mask <1, 0>. The GFX9 check expects the swap to be expressed through
; op_sel/op_sel_hi on the clamping v_pk_max_f16.
649 ; GCN-LABEL: {{^}}v_clamp_v2f16_shuffle:
650 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
652 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] op_sel:[1,1] op_sel_hi:[0,0] clamp{{$}}
653 define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
654 %tid = call i32 @llvm.amdgcn.workitem.id.x()
655 %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
656 %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
657 %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
658 %shuf = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> <i32 1, i32 0>
659 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer)
660 %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
662 store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
; Intrinsics used by the tests above: the workitem id, fabs/minnum/maxnum for
; f32, f64, f16 and <2 x half>, and the AMDGPU f32 fmed3. All are declared
; with attribute group #1.
666 declare i32 @llvm.amdgcn.workitem.id.x() #1
667 declare float @llvm.fabs.f32(float) #1
668 declare float @llvm.minnum.f32(float, float) #1
669 declare float @llvm.maxnum.f32(float, float) #1
670 declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
671 declare double @llvm.fabs.f64(double) #1
672 declare double @llvm.minnum.f64(double, double) #1
673 declare double @llvm.maxnum.f64(double, double) #1
674 declare half @llvm.fabs.f16(half) #1
675 declare half @llvm.minnum.f16(half, half) #1
676 declare half @llvm.maxnum.f16(half, half) #1
677 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
678 declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
679 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
; Attribute groups referenced by the kernels and declarations in this file.
;   #0  kernels with no explicit target-features string
;   #1  intrinsic declarations
;   #2  dx10-clamp disabled, fp-exceptions disabled
;   #3  dx10-clamp enabled,  fp-exceptions enabled
;   #4  dx10-clamp disabled, fp-exceptions enabled
; Groups #2 to #4 also set "no-nans-fp-math" to "false".
681 attributes #0 = { nounwind }
682 attributes #1 = { nounwind readnone }
683 attributes #2 = { nounwind "target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" }
684 attributes #3 = { nounwind "target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
685 attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }