test/CodeGen/AMDGPU/clamp.ll

   1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
   2 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI %s
   3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
   4
   5 ; GCN-LABEL: {{^}}v_clamp_f32:
   6 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
   7 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
     ; Clamp written as minnum(maxnum(%a, 0.0), 1.0) on a float loaded per
     ; workitem. The checks above expect it to select to a single v_max_f32_e64
     ; of the loaded value with itself, carrying the clamp modifier.
   8 define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
   9   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  10   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  11   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  12   %a = load float, float addrspace(1)* %gep0
  13   %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  14   %med = call float @llvm.minnum.f32(float %max, float 1.0)
  15
  16   store float %med, float addrspace(1)* %out.gep
  17   ret void
  18 }
  19
  20 ; GCN-LABEL: {{^}}v_clamp_neg_f32:
  21 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
  22 ; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
     ; Same min/max clamp pattern applied to the negated load (-0.0 - %a).
     ; The check expects the negation to show up as a source modifier on both
     ; operands of the clamped v_max_f32_e64.
  23 define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  24   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  25   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  26   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  27   %a = load float, float addrspace(1)* %gep0
  28   %fneg.a = fsub float -0.0, %a
  29   %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
  30   %med = call float @llvm.minnum.f32(float %max, float 1.0)
  31
  32   store float %med, float addrspace(1)* %out.gep
  33   ret void
  34 }
  35
  36 ; GCN-LABEL: {{^}}v_clamp_negabs_f32:
  37 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
  38 ; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
     ; Clamp of the negated absolute value of the load (-0.0 - fabs(%a)).
     ; The check expects both the abs and the neg to appear as source modifiers
     ; on the clamped v_max_f32_e64.
  39 define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  40   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  41   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  42   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  43   %a = load float, float addrspace(1)* %gep0
  44   %fabs.a = call float @llvm.fabs.f32(float %a)
  45   %fneg.fabs.a = fsub float -0.0, %fabs.a
  46
  47   %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
  48   %med = call float @llvm.minnum.f32(float %max, float 1.0)
  49
  50   store float %med, float addrspace(1)* %out.gep
  51   ret void
  52 }
  53
  54 ; GCN-LABEL: {{^}}v_clamp_negzero_f32:
  55 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
  56 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]]
  57 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[ADD]]
  58 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 1.0, [[MAX]]
     ; The lower bound here is -0.0 rather than 0.0, and the clamped value is
     ; the result of an nnan fadd. The checks expect a separate v_max_f32
     ; (against the 0x80000000 literal) and v_min_f32; they do not look for a
     ; clamp modifier.
  59 define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  60   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  61   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  62   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  63   %a = load float, float addrspace(1)* %gep0
  64   %add = fadd nnan float %a, 0.5
  65   %max = call float @llvm.maxnum.f32(float %add, float -0.0)
  66   %med = call float @llvm.minnum.f32(float %max, float 1.0)
  67
  68   store float %med, float addrspace(1)* %out.gep
  69   ret void
  70 }
  71
  72 ; FIXME: Weird inconsistency in how -0.0 is treated. Accepted if clamp
  73 ; matched through med3, not if directly. Is this correct?
  74
  75 ; GCN-LABEL: {{^}}v_clamp_negzero_maybe_snan_f32:
  76 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
  77 ; GCN: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
  78 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[QUIET]]
  79 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
     ; Same -0.0 lower bound, but the clamped value comes straight from the
     ; load. The checks expect a v_mul_f32 by 1.0 on the loaded value ahead of
     ; the separate max/min pair; judging by the QUIET capture name this is
     ; presumably there to quiet a possible signaling NaN (confirm).
  80 define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  81   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  82   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  83   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  84   %a = load float, float addrspace(1)* %gep0
  85   %max = call float @llvm.maxnum.f32(float %a, float -0.0)
  86   %med = call float @llvm.minnum.f32(float %max, float 1.0)
  87
  88   store float %med, float addrspace(1)* %out.gep
  89   ret void
  90 }
  91
  92 ; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
  93 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
  94 ; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
  95 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
  96 ; GCN: v_min_f32_e32 [[MED:v[0-9]+]], 1.0, [[MAX]]
  97 ; GCN-NOT: [[MAX]]
  98 ; GCN-NOT: [[MED]]
  99
 100 ; SI: buffer_store_dword [[MED]]
 101 ; SI: buffer_store_dword [[MAX]]
 102
 103 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MED]]
 104 ; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX]]
     ; The maxnum result has a second use (the volatile store below), so the
     ; checks expect the max and min to stay separate instructions and both
     ; results to be stored. The min consumes the max result, mirroring
     ; minnum(%max, 1.0) in the IR, so its check must reference the MAX capture
     ; and not the pre-max input.
 105 define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
 106   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 107   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 108   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 109   %a = load float, float addrspace(1)* %gep0
 110   %max = call float @llvm.maxnum.f32(float %a, float 0.0)
 111   %med = call float @llvm.minnum.f32(float %max, float 1.0)
 112
 113   store float %med, float addrspace(1)* %out.gep
     ; Extra use of %max: it is stored in addition to %med.
 114   store volatile float %max, float addrspace(1)* undef
 115   ret void
 116 }
 117
 118 ; GCN-LABEL: {{^}}v_clamp_f16:
 119 ; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
 120 ; GFX89: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
 121
 122 ; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
 123 ; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
     ; Half-precision version of the min/max clamp. Under the GFX89 prefix the
     ; checks expect a clamped v_max_f16_e64; under the SI prefix they expect
     ; the clamp modifier on the f16-to-f32 conversion, followed by a
     ; conversion back to f16.
 124 define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
 125   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 126   %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
 127   %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
 128   %a = load half, half addrspace(1)* %gep0
 129   %max = call half @llvm.maxnum.f16(half %a, half 0.0)
 130   %med = call half @llvm.minnum.f16(half %max, half 1.0)
 131
 132   store half %med, half addrspace(1)* %out.gep
 133   ret void
 134 }
 135
 136 ; GCN-LABEL: {{^}}v_clamp_neg_f16:
 137 ; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
 138 ; GFX89: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
 139
 140 ; FIXME: Better to fold neg into max
 141 ; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
 142 ; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
     ; Half-precision clamp of the negated load (-0.0 - %a). The GFX89 checks
     ; expect the neg as a source modifier on a clamped v_max_f16_e64; the SI
     ; checks expect the neg and the clamp on the f16-to-f32 conversion.
 143 define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
 144   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 145   %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
 146   %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
 147   %a = load half, half addrspace(1)* %gep0
 148   %fneg.a = fsub half -0.0, %a
 149   %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
 150   %med = call half @llvm.minnum.f16(half %max, half 1.0)
 151
 152   store half %med, half addrspace(1)* %out.gep
 153   ret void
 154 }
 155
 156 ; GCN-LABEL: {{^}}v_clamp_negabs_f16:
 157 ; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
 158 ; GFX89: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
 159
 160 ; FIXME: Better to fold neg/abs into max
 161
 162 ; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}}
 163 ; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
     ; Half-precision clamp of -fabs(%a). The GFX89 checks expect neg and abs
     ; as source modifiers on a clamped v_max_f16_e64; the SI checks expect the
     ; modifiers and the clamp on the f16-to-f32 conversion.
 164 define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
 165   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 166   %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
 167   %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
 168   %a = load half, half addrspace(1)* %gep0
 169   %fabs.a = call half @llvm.fabs.f16(half %a)
 170   %fneg.fabs.a = fsub half -0.0, %fabs.a
 171
 172   %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
 173   %med = call half @llvm.minnum.f16(half %max, half 1.0)
 174
 175   store half %med, half addrspace(1)* %out.gep
 176   ret void
 177 }
 178
 179 ; FIXME: Do f64 instructions support clamp?
 180 ; GCN-LABEL: {{^}}v_clamp_f64:
 181 ; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
 182 ; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
     ; Double-precision version of the min/max clamp. The value is loaded as a
     ; 64-bit register pair and the check expects a v_max_f64 of that pair with
     ; itself carrying the clamp modifier.
 183 define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
 184   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 185   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
 186   %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
 187   %a = load double, double addrspace(1)* %gep0
 188   %max = call double @llvm.maxnum.f64(double %a, double 0.0)
 189   %med = call double @llvm.minnum.f64(double %max, double 1.0)
 190
 191   store double %med, double addrspace(1)* %out.gep
 192   ret void
 193 }
 194
 195 ; GCN-LABEL: {{^}}v_clamp_neg_f64:
 196 ; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
 197 ; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
     ; Double-precision clamp of the negated load (-0.0 - %a). The check
     ; expects the neg as a source modifier on both operands of the clamped
     ; v_max_f64.
 198 define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
 199   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 200   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
 201   %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
 202   %a = load double, double addrspace(1)* %gep0
 203   %fneg.a = fsub double -0.0, %a
 204   %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
 205   %med = call double @llvm.minnum.f64(double %max, double 1.0)
 206
 207   store double %med, double addrspace(1)* %out.gep
 208   ret void
 209 }
 210
 211 ; GCN-LABEL: {{^}}v_clamp_negabs_f64:
 212 ; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
 213 ; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}}
     ; Double-precision clamp of -fabs(%a). The check expects neg and abs as
     ; source modifiers on both operands of the clamped v_max_f64.
 214 define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
 215   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 216   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
 217   %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
 218   %a = load double, double addrspace(1)* %gep0
 219   %fabs.a = call double @llvm.fabs.f64(double %a)
 220   %fneg.fabs.a = fsub double -0.0, %fabs.a
 221
 222   %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
 223   %med = call double @llvm.minnum.f64(double %max, double 1.0)
 224
 225   store double %med, double addrspace(1)* %out.gep
 226   ret void
 227 }
 228
 229 ; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32:
 230 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 231 ; GCN: v_med3_f32
     ; Clamp expressed directly through the fmed3 intrinsic with the constants
     ; -0.0 and 1.0 and the loaded value as the third operand. The check only
     ; requires that a v_med3_f32 is emitted.
 232 define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
 233   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 234   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 235   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 236   %a = load float, float addrspace(1)* %gep0
 237   %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
 238   store float %med, float addrspace(1)* %out.gep
 239   ret void
 240 }
 241
 242 ; GCN-LABEL: {{^}}v_clamp_med3_aby_f32:
 243 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 244 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
     ; fmed3 with operand order (0.0, 1.0, value). The check expects it to be
     ; selected as a clamped v_max_f32_e64 of the loaded value with itself.
 245 define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
 246   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 247   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 248   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 249   %a = load float, float addrspace(1)* %gep0
 250   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
 251   store float %med, float addrspace(1)* %out.gep
 252   ret void
 253 }
 254
 255 ; GCN-LABEL: {{^}}v_clamp_med3_bay_f32:
 256 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 257 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
     ; fmed3 with operand order (1.0, 0.0, value). The check expects it to be
     ; selected as a clamped v_max_f32_e64 of the loaded value with itself.
 258 define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
 259   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 260   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 261   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 262   %a = load float, float addrspace(1)* %gep0
 263   %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
 264   store float %med, float addrspace(1)* %out.gep
 265   ret void
 266 }
 267
 268 ; GCN-LABEL: {{^}}v_clamp_med3_yab_f32:
 269 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 270 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
     ; fmed3 with operand order (value, 0.0, 1.0). The check expects it to be
     ; selected as a clamped v_max_f32_e64 of the loaded value with itself.
 271 define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
 272   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 273   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 274   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 275   %a = load float, float addrspace(1)* %gep0
 276   %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
 277   store float %med, float addrspace(1)* %out.gep
 278   ret void
 279 }
 280
 281 ; GCN-LABEL: {{^}}v_clamp_med3_yba_f32:
 282 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 283 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
     ; fmed3 with operand order (value, 1.0, 0.0). The check expects it to be
     ; selected as a clamped v_max_f32_e64 of the loaded value with itself.
 284 define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
 285   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 286   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 287   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 288   %a = load float, float addrspace(1)* %gep0
 289   %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
 290   store float %med, float addrspace(1)* %out.gep
 291   ret void
 292 }
 293
 294 ; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32:
 295 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 296 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
     ; fmed3 with operand order (0.0, value, 1.0). The check expects it to be
     ; selected as a clamped v_max_f32_e64 of the loaded value with itself.
 297 define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
 298   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 299   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 300   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 301   %a = load float, float addrspace(1)* %gep0
 302   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
 303   store float %med, float addrspace(1)* %out.gep
 304   ret void
 305 }
 306
 307 ; GCN-LABEL: {{^}}v_clamp_med3_bya_f32:
 308 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 309 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
     ; fmed3 with operand order (1.0, value, 0.0). The check expects it to be
     ; selected as a clamped v_max_f32_e64 of the loaded value with itself.
 310 define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
 311   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 312   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 313   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 314   %a = load float, float addrspace(1)* %gep0
 315   %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
 316   store float %med, float addrspace(1)* %out.gep
 317   ret void
 318 }
 319
 320 ; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32:
 321 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0
     ; All three fmed3 operands are constants (0.0, 1.0, 4.0). The check
     ; expects the result to be materialized as the constant 1.0.
 322 define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
 323   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 324   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 325   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
 326   store float %med, float addrspace(1)* %out.gep
 327   ret void
 328 }
 329
 330 ; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32:
 331 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
     ; All three fmed3 operands are constants (0.0, 1.0, -4.0). The check
     ; expects the result to be materialized as the constant 0.
 332 define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
 333   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 334   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 335   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
 336   store float %med, float addrspace(1)* %out.gep
 337   ret void
 338 }
 339
 340 ; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32:
 341 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5
     ; The third fmed3 operand (0.5) already lies between the 0.0 and 1.0
     ; operands. The check expects it to be materialized unchanged as 0.5.
 342 define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
 343   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 344   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 345   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
 346   store float %med, float addrspace(1)* %out.gep
 347   ret void
 348 }
 349
 350 ; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32:
 351 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}}
     ; The third fmed3 operand is the float whose bit pattern is i32 8388607
     ; (0x7fffff). The check expects exactly that bit pattern to be
     ; materialized, i.e. the constant is neither clamped nor flushed.
 352 define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
 353   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 354   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 355   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
 356   store float %med, float addrspace(1)* %out.gep
 357   ret void
 358 }
 359
 360 ; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32:
 361 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
     ; The third fmed3 operand is a constant NaN (the test name calls it a
     ; quiet NaN). The check expects the result to be materialized as 0.
 362 define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
 363   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 364   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 365   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
 366   store float %med, float addrspace(1)* %out.gep
 367   ret void
 368 }
 369
 370 ; GCN-LABEL: {{^}}v_clamp_constant_snan_f32:
 371 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
     ; The third fmed3 operand is the float whose bit pattern is i32 2139095041
     ; (0x7f800001; the test name calls it a signaling NaN). The check expects
     ; the result to be materialized as 0.
 372 define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
 373   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 374   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 375   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
 376   store float %med, float addrspace(1)* %out.gep
 377   ret void
 378 }
 379
 380 ; ---------------------------------------------------------------------
 381 ; Test non-default behaviors enabling snans and disabling dx10_clamp
 382 ; ---------------------------------------------------------------------
 383
 384 ; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
 385 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 386 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]]
 387 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
     ; min/max clamp of an nnan fadd result, compiled with attribute group #2.
     ; The checks expect a v_med3_f32 with the constants 0 and 1.0 and no clamp
     ; modifier. NOTE(review): #2 is defined outside this view; going by the
     ; test name it presumably disables dx10_clamp - confirm.
 388 define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
 389   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 390   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 391   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 392   %a = load float, float addrspace(1)* %gep0
 393   %a.nnan = fadd nnan float %a, 0.5
 394   %max = call float @llvm.maxnum.f32(float %a.nnan, float 0.0)
 395   %med = call float @llvm.minnum.f32(float %max, float 1.0)
 396
 397   store float %med, float addrspace(1)* %out.gep
 398   ret void
 399 }
 400
 401 ; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp:
 402 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 403 ; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 0.5 clamp{{$}}
     ; min/max clamp of a plain fadd result, compiled with attribute group #3.
     ; The check expects the clamp modifier to be placed directly on the
     ; v_add_f32_e64. NOTE(review): #3 is defined outside this view; going by
     ; the test name it presumably enables snan handling with dx10_clamp left
     ; on - confirm.
 404 define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
 405   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 406   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 407   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 408   %a = load float, float addrspace(1)* %gep0
 409   %add = fadd float %a, 0.5
 410   %max = call float @llvm.maxnum.f32(float %add, float 0.0)
 411   %med = call float @llvm.minnum.f32(float %max, float 1.0)
 412
 413   store float %med, float addrspace(1)* %out.gep
 414   ret void
 415 }
 416
 417 ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
 418 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 419 ; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
 420 ; GCN: v_med3_f32 {{v[0-9]+}}, [[QUIET_A]], 0, 1.0
     ; min/max clamp of the raw load, compiled with attribute group #4. The
     ; checks expect a v_mul_f32 by 1.0 on the loaded value followed by a
     ; v_med3_f32 with the constants 0 and 1.0. NOTE(review): #4 is defined
     ; outside this view; going by the test name it presumably enables snan
     ; handling and disables dx10_clamp - confirm.
 421 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
 422   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 423   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 424   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 425   %a = load float, float addrspace(1)* %gep0
 426   %max = call float @llvm.maxnum.f32(float %a, float 0.0)
 427   %med = call float @llvm.minnum.f32(float %max, float 1.0)
 428
 429   store float %med, float addrspace(1)* %out.gep
 430   ret void
 431 }
 432
 433 ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
 434 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 435 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
 436 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
     ; Same attribute group (#4) as the preceding snan/no-dx10clamp test, but
     ; the clamped value is an nnan fadd result. The checks expect the
     ; v_med3_f32 to consume the add result directly, with no v_mul_f32 by 1.0
     ; in between. NOTE(review): #4 is defined outside this view - confirm its
     ; contents.
 437 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
 438   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 439   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 440   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 441   %a = load float, float addrspace(1)* %gep0
 442   %add  = fadd nnan float %a, 1.0
 443   %max = call float @llvm.maxnum.f32(float %add, float 0.0)
 444   %med = call float @llvm.minnum.f32(float %max, float 1.0)
 445
 446   store float %med, float addrspace(1)* %out.gep
 447   ret void
 448 }
 449
 450 ; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp:
 451 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 452 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
     ; fmed3 with operand order (0.0, 1.0, value) under attribute group #2.
     ; With the value in the last position the check still expects a clamped
     ; v_max_f32_e64. NOTE(review): #2 is defined outside this view;
     ; presumably it disables dx10_clamp - confirm.
 453 define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
 454   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 455   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 456   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 457   %a = load float, float addrspace(1)* %gep0
 458   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
 459   store float %med, float addrspace(1)* %out.gep
 460   ret void
 461 }
 462
 463 ; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp:
 464 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 465 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
     ; fmed3 with operand order (1.0, 0.0, value) under attribute group #2.
     ; With the value in the last position the check still expects a clamped
     ; v_max_f32_e64. NOTE(review): #2 is defined outside this view;
     ; presumably it disables dx10_clamp - confirm.
 466 define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
 467   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 468   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 469   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 470   %a = load float, float addrspace(1)* %gep0
 471   %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
 472   store float %med, float addrspace(1)* %out.gep
 473   ret void
 474 }
 475
 476 ; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp:
 477 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 478 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
     ; fmed3 with operand order (value, 0.0, 1.0) under attribute group #2.
     ; The check expects a v_med3_f32 with the operands in the same order as
     ; the IR and no clamp modifier. NOTE(review): #2 is defined outside this
     ; view; presumably it disables dx10_clamp - confirm.
 479 define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
 480   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 481   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 482   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 483   %a = load float, float addrspace(1)* %gep0
 484   %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
 485   store float %med, float addrspace(1)* %out.gep
 486   ret void
 487 }
 488
 489 ; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp:
 490 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 491 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0
     ; fmed3 with operand order (value, 1.0, 0.0) under attribute group #2.
     ; The check expects a v_med3_f32 with the operands in the same order as
     ; the IR and no clamp modifier. NOTE(review): #2 is defined outside this
     ; view; presumably it disables dx10_clamp - confirm.
 492 define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
 493   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 494   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 495   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 496   %a = load float, float addrspace(1)* %gep0
 497   %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
 498   store float %med, float addrspace(1)* %out.gep
 499   ret void
 500 }
 501
 502 ; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp:
 503 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 504 ; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0
     ; fmed3 with operand order (0.0, value, 1.0) under attribute group #2.
     ; The check expects a v_med3_f32 with the operands in the same order as
     ; the IR and no clamp modifier. NOTE(review): #2 is defined outside this
     ; view; presumably it disables dx10_clamp - confirm.
 505 define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
 506   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 507   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 508   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 509   %a = load float, float addrspace(1)* %gep0
 510   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
 511   store float %med, float addrspace(1)* %out.gep
 512   ret void
 513 }
 514
 515 ; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp:
 516 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 517 ; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0
     ; fmed3 with operand order (1.0, value, 0.0) under attribute group #2.
     ; The check expects a v_med3_f32 with the operands in the same order as
     ; the IR and no clamp modifier. NOTE(review): #2 is defined outside this
     ; view; presumably it disables dx10_clamp - confirm.
 518 define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
 519   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 520   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
 521   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 522   %a = load float, float addrspace(1)* %gep0
 523   %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
 524   store float %med, float addrspace(1)* %out.gep
 525   ret void
 526 }
 527
 528 ; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp:
 529 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
     ; Constant-NaN fmed3 operand under attribute group #2. Unlike the #0
     ; variant of this test, the check expects the NaN bit pattern 0x7fc00000
     ; to be materialized rather than 0. NOTE(review): #2 is defined outside
     ; this view; presumably it disables dx10_clamp - confirm.
 530 define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
 531   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 532   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 533   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
 534   store float %med, float addrspace(1)* %out.gep
 535   ret void
 536 }
 537
 538 ; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp:
 539 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001
     ; The third fmed3 operand is the float with bit pattern i32 2139095041
     ; (0x7f800001), compiled under attribute group #2. The check expects that
     ; same bit pattern to be materialized rather than 0. NOTE(review): #2 is
     ; defined outside this view; presumably it disables dx10_clamp - confirm.
 540 define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
 541   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 542   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 543   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
 544   store float %med, float addrspace(1)* %out.gep
 545   ret void
 546 }
 547
 548 ; GCN-LABEL: {{^}}v_clamp_v2f16:
 549 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 550 ; GFX9-NOT: [[A]]
 551 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
     ; Packed <2 x half> version of the min/max clamp with bounds <0,0> and
     ; <1,1>. Only the GFX9 prefix checks the arithmetic: it expects a single
     ; clamped v_pk_max_f16 and no other use of the loaded register before it.
 552 define amdgpu_kernel void @v_clamp_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
 553   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 554   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
 555   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
 556   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
 557   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> zeroinitializer)
 558   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
 559
 560   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
 561   ret void
 562 }
 563
 564 ; GCN-LABEL: {{^}}v_clamp_v2f16_undef_elt:
 565 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 566 ; GFX9-NOT: [[A]]
 567 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
     ; Packed clamp where each bound vector has one undef element
     ; (<undef, 0.0> for the max, <1.0, undef> for the min). The GFX9 checks
     ; still expect a single clamped v_pk_max_f16.
 568 define amdgpu_kernel void @v_clamp_v2f16_undef_elt(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
 569   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 570   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
 571   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
 572   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
 573   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
 574   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
 575
 576   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
 577   ret void
 578 }
 579
 580 ; GCN-LABEL: {{^}}v_clamp_v2f16_not_zero:
 581 ; GFX9: v_pk_max_f16
 582 ; GFX9: v_pk_min_f16
     ; The lower-bound vector is <2.0, 0.0>, i.e. not all zeros. The GFX9
     ; checks expect a v_pk_max_f16 followed by a separate v_pk_min_f16.
 583 define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
 584   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 585   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
 586   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
 587   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
 588   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 0.0>)
 589   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
 590
 591   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
 592   ret void
 593 }
 594
 595 ; GCN-LABEL: {{^}}v_clamp_v2f16_not_one:
 596 ; GFX9: v_pk_max_f16
 597 ; GFX9: v_pk_min_f16
     ; The upper-bound vector is <0.0, 1.0>, i.e. not all ones. The GFX9
     ; checks expect a v_pk_max_f16 followed by a separate v_pk_min_f16.
 598 define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
 599   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 600   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
 601   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
 602   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
 603   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half 0.0>)
 604   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 0.0, half 1.0>)
 605
 606   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
 607   ret void
 608 }
 609
 610 ; GCN-LABEL: {{^}}v_clamp_neg_v2f16:
 611 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 612 ; GFX9-NOT: [[A]]
 613 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
     ; Packed clamp of the negated vector (<-0.0, -0.0> - %a). The GFX9 check
     ; expects the negation of both halves to be expressed through the neg_lo
     ; and neg_hi modifiers on a single clamped v_pk_max_f16.
 614 define amdgpu_kernel void @v_clamp_neg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
 615   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 616   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
 617   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
 618   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
 619   %fneg.a = fsub <2 x half> <half -0.0, half -0.0>, %a
 620   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.a, <2 x half> zeroinitializer)
 621   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
 622
 623   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
 624   ret void
 625 }
 626
 627 ; GCN-LABEL: {{^}}v_clamp_negabs_v2f16:
     ; Clamp of -fabs(x) on <2 x half>. The GFX9 checks expect the fabs as a
     ; separate AND with 0x7fff7fff (bit 15 of each 16-bit half cleared), and
     ; the negation as neg_lo/neg_hi modifiers on the clamped packed max.
 628 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 629 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, [[A]]
 630 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
 631 define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
       ; Index both buffers by the workitem id so each lane handles one element.
 632   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 633   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
 634   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
 635   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
       ; fabs followed by negation written as (-0.0 - x) in both lanes.
 636   %fabs.a = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
 637   %fneg.fabs.a = fsub <2 x half> <half -0.0, half -0.0>, %fabs.a
 638
       ; maxnum with 0.0 followed by minnum with 1.0 in every lane.
 639   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.fabs.a, <2 x half> zeroinitializer)
 640   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
 641
 642   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
 643   ret void
 644 }
 645
 646 ; GCN-LABEL: {{^}}v_clamp_neglo_v2f16:
     ; Clamp where only element 0 of the vector is negated. The GFX9 checks
     ; expect a single clamped packed max with neg_lo set on both operands and
     ; no other mention of the loaded register before it.
 647 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 648 ; GFX9-NOT: [[A]]
 649 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] clamp{{$}}
 650 define amdgpu_kernel void @v_clamp_neglo_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
       ; Index both buffers by the workitem id so each lane handles one element.
 651   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 652   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
 653   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
 654   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
       ; Negate element 0 only (as -0.0 - x) and write it back into the vector;
       ; element 1 is left as loaded.
 655   %lo = extractelement <2 x half> %a, i32 0
 656   %neg.lo = fsub half -0.0, %lo
 657   %neg.lo.vec = insertelement <2 x half> %a, half %neg.lo, i32 0
       ; maxnum with 0.0 followed by minnum with 1.0 in every lane.
 658   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.vec, <2 x half> zeroinitializer)
 659   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
 660
 661   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
 662   ret void
 663 }
 664
 665 ; GCN-LABEL: {{^}}v_clamp_neghi_v2f16:
     ; Clamp where only element 1 of the vector is negated. The GFX9 checks
     ; expect a single clamped packed max with neg_hi set on both operands and
     ; no other mention of the loaded register before it.
 666 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 667 ; GFX9-NOT: [[A]]
 668 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_hi:[1,1] clamp{{$}}
 669 define amdgpu_kernel void @v_clamp_neghi_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
       ; Index both buffers by the workitem id so each lane handles one element.
 670   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 671   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
 672   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
 673   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
       ; Negate element 1 only (as -0.0 - x) and write it back into the vector;
       ; element 0 is left as loaded.
 674   %hi = extractelement <2 x half> %a, i32 1
 675   %neg.hi = fsub half -0.0, %hi
 676   %neg.hi.vec = insertelement <2 x half> %a, half %neg.hi, i32 1
       ; maxnum with 0.0 followed by minnum with 1.0 in every lane.
 677   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.vec, <2 x half> zeroinitializer)
 678   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
 679
 680   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
 681   ret void
 682 }
 683
 684 ; GCN-LABEL: {{^}}v_clamp_v2f16_shuffle:
     ; Clamp of a vector whose two elements were swapped by a shufflevector.
     ; The GFX9 checks expect the swap to appear as op_sel/op_sel_hi modifiers
     ; on the clamped packed max, with no other use of the loaded register
     ; before it.
 685 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 686 ; GFX9-NOT: [[A]]
 687 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] op_sel:[1,1] op_sel_hi:[0,0] clamp{{$}}
 688 define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
       ; Index both buffers by the workitem id so each lane handles one element.
 689   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 690   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
 691   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
 692   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
       ; Mask <1, 0> exchanges element 0 and element 1.
 693   %shuf = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> <i32 1, i32 0>
       ; maxnum with 0.0 followed by minnum with 1.0 in every lane.
 694   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer)
 695   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
 696
 697   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
 698   ret void
 699 }
 700
 701 ; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts0:
     ; Clamp whose bound vectors each contain one undef element: the lower
     ; bound is <0.0, undef> and the upper bound is <undef, 1.0>. The GFX9
     ; checks still expect a single clamped packed max with no extra modifiers.
 702 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 703 ; GFX9-NOT: [[A]]
 704 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
 705 define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
       ; Index both buffers by the workitem id so each lane handles one element.
 706   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 707   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
 708   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
 709   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
       ; Element 1 of the lower bound and element 0 of the upper bound are undef.
 710   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>)
 711   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>)
 712
 713   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
 714   ret void
 715 }
 716
 717 ; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts1:
     ; Clamp whose bound vectors each contain one undef element: the lower
     ; bound is <undef, 0.0> and the upper bound is <1.0, undef>. The GFX9
     ; checks still expect a single clamped packed max with no extra modifiers.
 718 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 719 ; GFX9-NOT: [[A]]
 720 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
 721 define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
       ; Index both buffers by the workitem id so each lane handles one element.
 722   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 723   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
 724   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
 725   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
       ; Element 0 of the lower bound and element 1 of the upper bound are undef.
 726   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
 727   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
 728
 729   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
 730   ret void
 731 }
 732
 733 ; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
     ; Clamp applied to maxnum(a, b) where a and b are two different fadd
     ; results. The checks (all run lines) expect two v_add_f32 results to feed
     ; one v_max_f32 that also carries the clamp modifier.
 734 ; GCN: v_add_f32_e32 [[A:v[0-9]+]]
 735 ; GCN: v_add_f32_e32 [[B:v[0-9]+]]
 736 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[B]] clamp{{$}}
 737 define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0
 738 {
       ; Load three consecutive floats from fixed offsets 0, 1 and 2; no
       ; workitem id is involved in this test.
 739   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 0
 740   %gep1 = getelementptr float, float addrspace(1)* %aptr, i32 1
 741   %gep2 = getelementptr float, float addrspace(1)* %aptr, i32 2
 742   %l0 = load float, float addrspace(1)* %gep0
 743   %l1 = load float, float addrspace(1)* %gep1
 744   %l2 = load float, float addrspace(1)* %gep2
       ; Two sums sharing %l0; every FP operation from here on is marked nsz.
 745   %a = fadd nsz float %l0, %l1
 746   %b = fadd nsz float %l0, %l2
       ; max of the two sums, then maxnum with 0.0 and minnum with 1.0.
 747   %res = call nsz float @llvm.maxnum.f32(float %a, float %b)
 748   %max = call nsz float @llvm.maxnum.f32(float %res, float 0.0)
 749   %min = call nsz float @llvm.minnum.f32(float %max, float 1.0)
       ; The result is stored at fixed element index 3 of the output buffer.
 750   %out.gep = getelementptr float, float addrspace(1)* %out, i32 3
 751   store float %min, float addrspace(1)* %out.gep
 752   ret void
 753 }
 754
 755 declare i32 @llvm.amdgcn.workitem.id.x() #1
     ; Floating-point intrinsics, grouped by operand type: f32 (plus the AMDGPU
     ; fmed3 intrinsic), f64, f16 and <2 x half>. All use attribute group #1.
 756 declare float @llvm.fabs.f32(float) #1
 757 declare float @llvm.minnum.f32(float, float) #1
 758 declare float @llvm.maxnum.f32(float, float) #1
 759 declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
 760 declare double @llvm.fabs.f64(double) #1
 761 declare double @llvm.minnum.f64(double, double) #1
 762 declare double @llvm.maxnum.f64(double, double) #1
 763 declare half @llvm.fabs.f16(half) #1
 764 declare half @llvm.minnum.f16(half, half) #1
 765 declare half @llvm.maxnum.f16(half, half) #1
 766 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
 767 declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
 768 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
 769
     ; Attribute groups. #0 is plain nounwind and #1 adds readnone for the
     ; intrinsic declarations. #2, #3 and #4 all keep no-nans-fp-math=false and
     ; differ in amdgpu-dx10-clamp (false/true/false) and in whether the
     ; fp-exceptions target feature is disabled (#2) or enabled (#3, #4).
 770 attributes #0 = { nounwind }
 771 attributes #1 = { nounwind readnone }
 772 attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="-fp-exceptions" "no-nans-fp-math"="false" }
 773 attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "target-features"="+fp-exceptions" "no-nans-fp-math"="false" }
 774 attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="+fp-exceptions" "no-nans-fp-math"="false" }