test/CodeGen/AMDGPU/fcanonicalize-elimination.ll

   1 ; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s
   2 ; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,VI-FLUSH,GCN-FLUSH %s
   3 ; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM,GCN-NOEXCEPT %s
   4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM,GCN-NOEXCEPT %s
   5 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s
   6
   7 ; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
   8 ; GCN-FLUSH:   v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
   9 ; GFX9-DENORM: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
  10 define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) {
  11   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  12   %ptr = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
  13   %x = load float, float addrspace(1)* %ptr, align 4
  14   %canon = tail call float @llvm.canonicalize.f32(float %x) ; source is a plain load, checks expect the canonicalize to be kept
  15   store float %canon, float addrspace(1)* %ptr, align 4
  16   ret void
  17 }
  18
  19 ; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32:
  20 ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
  21 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
  22 ; GCN-NOT: 1.0
  23 define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) {
  24   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  25   %ptr = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
  26   %x = load float, float addrspace(1)* %ptr, align 4
  27   %prod = fmul float %x, 15.0
  28   %canon = tail call float @llvm.canonicalize.f32(float %prod) ; checks expect the v_mul result to be stored directly
  29   store float %canon, float addrspace(1)* %ptr, align 4
  30   ret void
  31 }
  32
  33 ; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_legacy_value_f32:
  34 ; GCN: v_mul_legacy_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
  35 ; GCN-NOT: v_mul
  36 ; GCN-NOT: v_max
  37 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
  38 define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(float addrspace(1)* %arg) {
  39   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  40   %ptr = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
  41   %x = load float, float addrspace(1)* %ptr, align 4
  42   %prod = call float @llvm.amdgcn.fmul.legacy(float %x, float 15.0)
  43   %canon = tail call float @llvm.canonicalize.f32(float %prod) ; checks expect this to fold away (no extra v_mul/v_max)
  44   store float %canon, float addrspace(1)* %ptr, align 4
  45   ret void
  46 }
  47
  48 ; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
  49 ; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
  50 ; GCN-NOT: v_mul
  51 ; GCN-NOT: v_max
  52 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
  53 define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) {
  54   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  55   %ptr = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
  56   %x = load float, float addrspace(1)* %ptr, align 4
  57   %diff = fsub float 15.0, %x
  58   %canon = tail call float @llvm.canonicalize.f32(float %diff) ; checks expect this to fold away (no extra v_mul/v_max)
  59   store float %canon, float addrspace(1)* %ptr, align 4
  60   ret void
  61 }
  62
  63 ; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
  64 ; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
  65 ; GCN-NOT: v_mul
  66 ; GCN-NOT: v_max
  67 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
  68 define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) {
  69   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  70   %ptr = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
  71   %x = load float, float addrspace(1)* %ptr, align 4
  72   %sum = fadd float %x, 15.0
  73   %canon = tail call float @llvm.canonicalize.f32(float %sum) ; checks expect this to fold away (no extra v_mul/v_max)
  74   store float %canon, float addrspace(1)* %ptr, align 4
  75   ret void
  76 }
  77
  78 ; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
  79 ; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
  80 ; GCN-NOT: v_mul
  81 ; GCN-NOT: v_max
  82 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
  83 define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) {
  84   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  85   %ptr = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
  86   %x = load float, float addrspace(1)* %ptr, align 4
  87   %root = call float @llvm.sqrt.f32(float %x)
  88   %canon = tail call float @llvm.canonicalize.f32(float %root) ; checks expect this to fold away (no extra v_mul/v_max)
  89   store float %canon, float addrspace(1)* %ptr, align 4
  90   ret void
  91 }
  92
  93 ; GCN-LABEL: {{^}}test_fold_canonicalize_fceil_value_f32:
  94 ; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
  95 ; GCN-NOT: v_mul
  96 ; GCN-NOT: v_max
  97 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
  98 define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) {
  99   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 100   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 101   %load = load float, float addrspace(1)* %gep, align 4
 102   %v = call float @llvm.ceil.f32(float %load)
 103   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect this to fold away (no extra v_mul/v_max)
 104   store float %canonicalized, float addrspace(1)* %gep, align 4
 105   ret void
 106 }
 107
 108 ; GCN-LABEL: {{^}}test_fold_canonicalize_floor_value_f32:
 109 ; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 110 ; GCN-NOT: v_mul
 111 ; GCN-NOT: v_max
 112 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 113 define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) {
 114   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 115   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 116   %load = load float, float addrspace(1)* %gep, align 4
 117   %v = call float @llvm.floor.f32(float %load)
 118   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect this to fold away (no extra v_mul/v_max)
 119   store float %canonicalized, float addrspace(1)* %gep, align 4
 120   ret void
 121 }
 122
 123 ; GCN-LABEL: {{^}}test_fold_canonicalize_fma_value_f32:
 124 ; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
 125 ; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
 126 ; GCN-NOT: v_mul
 127 ; GCN-NOT: v_max
 128 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 129 define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) {
 130   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 131   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 132   %load = load float, float addrspace(1)* %gep, align 4
 133   %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0)
 134   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect this to fold away (no extra v_mul/v_max)
 135   store float %canonicalized, float addrspace(1)* %gep, align 4
 136   ret void
 137 }
 138
 139 ; GCN-LABEL: {{^}}test_fold_canonicalize_fmad_ftz_value_f32:
 140 ; GCN: s_mov_b32 [[SGPR:s[0-9]+]], 0x41700000
 141 ; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SGPR]], [[SGPR]]
 142 ; GCN-NOT: v_mul
 143 ; GCN-NOT: v_max
 144 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 145 define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(float addrspace(1)* %arg) {
 146   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 147   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 148   %load = load float, float addrspace(1)* %gep, align 4
 149   %v = call float @llvm.amdgcn.fmad.ftz.f32(float %load, float 15.0, float 15.0)
 150   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect this to fold away (no extra v_mul/v_max)
 151   store float %canonicalized, float addrspace(1)* %gep, align 4
 152   ret void
 153 }
 154
 155 ; GCN-LABEL: {{^}}test_fold_canonicalize_fmuladd_value_f32:
 156 ; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 157 ; GCN-DENORM: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
 158 ; GCN-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
 159 ; GCN-NOT: v_mul
 160 ; GCN-NOT: v_max
 161 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 162 ; GCN-NOT: 1.0
 163 define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
 164   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 165   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 166   %load = load float, float addrspace(1)* %gep, align 4
 167   %v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0) ; checks expect v_mac in flush runs and v_fma in denormal runs
 168   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect this to fold away (no extra v_mul/v_max)
 169   store float %canonicalized, float addrspace(1)* %gep, align 4
 170   ret void
 171 }
 172
 173 ; GCN-LABEL: {{^}}test_fold_canonicalize_canonicalize_value_f32:
 174 ; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]],
 175 ; GCN-FLUSH:  v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
 176 ; GCN-DENORM: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]]
 177 ; GCN-NOT: v_mul
 178 ; GCN-NOT: v_max
 179 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 180 define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
 181   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 182   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 183   %load = load float, float addrspace(1)* %gep, align 4
 184   %v = call float @llvm.canonicalize.f32(float %load) ; first canonicalize of the raw load
 185   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; second canonicalize, checks allow only one v_mul/v_max before the store
 186   store float %canonicalized, float addrspace(1)* %gep, align 4
 187   ret void
 188 }
 189
 190 ; GCN-LABEL: {{^}}test_fold_canonicalize_fpextend_value_f64_f32:
 191 ; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
 192 ; GCN-NOT: v_mul
 193 ; GCN-NOT: v_max
 194 ; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
 195 define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) {
 196   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 197   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 198   %load = load float, float addrspace(1)* %gep, align 4
 199   %v = fpext float %load to double
 200   %canonicalized = tail call double @llvm.canonicalize.f64(double %v) ; checks expect this to fold away (no extra v_mul/v_max)
 201   %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
 202   store double %canonicalized, double addrspace(1)* %gep2, align 8
 203   ret void
 204 }
 205
 206 ; GCN-LABEL: {{^}}test_fold_canonicalize_fpextend_value_f32_f16:
 207 ; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 208 ; GCN-NOT: v_mul
 209 ; GCN-NOT: v_max
 210 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 211 define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) {
 212   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 213   %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
 214   %load = load half, half addrspace(1)* %gep, align 2
 215   %v = fpext half %load to float
 216   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect this to fold away (no extra v_mul/v_max)
 217   %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
 218   store float %canonicalized, float addrspace(1)* %gep2, align 4
 219   ret void
 220 }
 221
 222 ; GCN-LABEL: {{^}}test_fold_canonicalize_fpextend_value_f32_f16_flushf16:
 223 ; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 224 ; GCN-NOT: v_mul
 225 ; GCN-NOT: v_max
 226 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 227 define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16_flushf16(half addrspace(1)* %arg, float addrspace(1)* %out) #2 {
 228   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 229   %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
 230   %load = load half, half addrspace(1)* %gep, align 2
 231   %v = fpext half %load to float
 232   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect this to fold away (no extra v_mul/v_max)
 233   %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
 234   store float %canonicalized, float addrspace(1)* %gep2, align 4
 235   ret void
 236 }
 237
 238 ; GCN-LABEL: {{^}}test_fold_canonicalize_fpround_value_f32_f64:
 239 ; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
 240 ; GCN-NOT: v_mul
 241 ; GCN-NOT: v_max
 242 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 243 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) {
 244   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 245   %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
 246   %load = load double, double addrspace(1)* %gep, align 8
 247   %v = fptrunc double %load to float
 248   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect this to fold away (no extra v_mul/v_max)
 249   %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
 250   store float %canonicalized, float addrspace(1)* %gep2, align 4
 251   ret void
 252 }
 253
 254 ; GCN-LABEL: {{^}}test_fold_canonicalize_fpround_value_f16_f32:
 255 ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 256 ; GCN-NOT: v_max
 257 ; GCN-NOT: v_mul
 258 ; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
 259 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) {
 260   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 261   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 262   %load = load float, float addrspace(1)* %gep, align 4
 263   %v = fptrunc float %load to half
 264   %canonicalized = tail call half @llvm.canonicalize.f16(half %v) ; checks expect this to fold away (no extra v_mul/v_max)
 265   %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
 266   store half %canonicalized, half addrspace(1)* %gep2, align 2
 267   ret void
 268 }
 269
 270 ; GCN-LABEL: {{^}}test_fold_canonicalize_fpround_value_f16_f32_flushf16:
 271 ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 272 ; GCN-NOT: v_max
 273 ; GCN-NOT: v_mul
 274 ; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
 275 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16(float addrspace(1)* %arg, half addrspace(1)* %out) #2 {
 276   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 277   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 278   %load = load float, float addrspace(1)* %gep, align 4
 279   %v = fptrunc float %load to half
 280   %canonicalized = tail call half @llvm.canonicalize.f16(half %v) ; checks expect this to fold away (no extra v_mul/v_max)
 281   %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
 282   store half %canonicalized, half addrspace(1)* %gep2, align 2
 283   ret void
 284 }
 285
 286 ; GCN-LABEL: {{^}}test_fold_canonicalize_fpround_value_v2f16_v2f32:
 287 ; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
 288 ; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
 289 ; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
 290 ; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
 291 ; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
 292 ; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
 293 ; GCN-NOT: v_mul
 294 ; GCN-NOT: v_max
 295 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 296 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
 297   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 298   %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
 299   %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
 300   %v = fptrunc <2 x float> %load to <2 x half>
 301   %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v) ; checks expect only the two converts plus packing, no extra v_mul/v_max
 302   %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id
 303   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 4
 304   ret void
 305 }
 306
 307 ; GCN-LABEL: {{^}}test_no_fold_canonicalize_fneg_value_f32:
 308 ; GCN-FLUSH:  v_mul_f32_e32 v{{[0-9]+}}, -1.0, v{{[0-9]+}}
 309 ; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
 310 define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
 311   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 312   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 313   %load = load float, float addrspace(1)* %gep, align 4
 314   %v = fsub float -0.0, %load ; negation of a raw load
 315   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect a v_mul by -1.0 or a v_max with negated sources
 316   store float %canonicalized, float addrspace(1)* %gep, align 4
 317   ret void
 318 }
 319
 320 ; GCN-LABEL: {{^}}test_fold_canonicalize_fneg_value_f32:
 321 ; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
 322 ; GCN-NOT: v_mul
 323 ; GCN-NOT: v_max
 324 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 325 define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
 326   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 327   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 328   %load = load float, float addrspace(1)* %gep, align 4
 329   %v0 = fadd float %load, 0.0 ; the negation below takes an arithmetic result rather than the raw load
 330   %v = fsub float -0.0, %v0
 331   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect this to fold away (no v_mul/v_max after the v_xor)
 332   store float %canonicalized, float addrspace(1)* %gep, align 4
 333   ret void
 334 }
 335
 336 ; GCN-LABEL: {{^}}test_no_fold_canonicalize_fabs_value_f32:
 337 ; GCN-FLUSH:  v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
 338 ; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
 339 define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
 340   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 341   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 342   %load = load float, float addrspace(1)* %gep, align 4
 343   %v = tail call float @llvm.fabs.f32(float %load) ; fabs of a raw load
 344   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect a v_mul/v_max with |x| source modifiers
 345   store float %canonicalized, float addrspace(1)* %gep, align 4
 346   ret void
 347 }
 348
 349 ; GCN-LABEL: {{^}}test_no_fold_canonicalize_fcopysign_value_f32:
 350 ; GCN-FLUSH:  v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
 351 ; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
 352 ; GCN-NOT: v_mul_
 353 ; GCN-NOT: v_max_
 354 define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(float addrspace(1)* %arg, float %sign) {
 355   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 356   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 357   %load = load float, float addrspace(1)* %gep, align 4
 358   %canon.load = tail call float @llvm.canonicalize.f32(float %load)
 359   %copysign = call float @llvm.copysign.f32(float %canon.load, float %sign) ; NOTE(review) this result is never used below, so the copysign does not feed the stored value - confirm intent
 360   %v = tail call float @llvm.fabs.f32(float %load) ; the stored value is derived from fabs of the raw load
 361   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect exactly one v_mul/v_max with |x| source modifiers
 362   store float %canonicalized, float addrspace(1)* %gep, align 4
 363   ret void
 364 }
 365
 366 ; GCN-LABEL: {{^}}test_fold_canonicalize_fabs_value_f32:
 367 ; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
 368 ; GCN-NOT: v_mul
 369 ; GCN-NOT: v_max
 370 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 371 define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
 372   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 373   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 374   %load = load float, float addrspace(1)* %gep, align 4
 375   %v0 = fadd float %load, 0.0 ; the fabs below takes an arithmetic result rather than the raw load
 376   %v = tail call float @llvm.fabs.f32(float %v0)
 377   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect this to fold away (no v_mul/v_max after the v_and)
 378   store float %canonicalized, float addrspace(1)* %gep, align 4
 379   ret void
 380 }
 381
 382 ; GCN-LABEL: {{^}}test_fold_canonicalize_sin_value_f32:
 383 ; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 384 ; GCN-NOT: v_mul
 385 ; GCN-NOT: v_max
 386 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 387 define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) {
 388   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 389   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 390   %load = load float, float addrspace(1)* %gep, align 4
 391   %v = tail call float @llvm.sin.f32(float %load)
 392   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect this to fold away (no v_mul/v_max after the v_sin)
 393   store float %canonicalized, float addrspace(1)* %gep, align 4
 394   ret void
 395 }
 396
 397 ; GCN-LABEL: {{^}}test_fold_canonicalize_cos_value_f32:
 398 ; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 399 ; GCN-NOT: v_mul
 400 ; GCN-NOT: v_max
 401 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 402 define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) {
 403   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 404   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 405   %load = load float, float addrspace(1)* %gep, align 4
 406   %v = tail call float @llvm.cos.f32(float %load)
 407   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect this to fold away (no v_mul/v_max after the v_cos)
 408   store float %canonicalized, float addrspace(1)* %gep, align 4
 409   ret void
 410 }
 411
 412 ; GCN-LABEL: {{^}}test_fold_canonicalize_sin_value_f16:
 413 ; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
 414 ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
 415 ; GCN-NOT: v_mul
 416 ; GCN-NOT: v_max
 417 ; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
 418 define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) {
 419   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 420   %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
 421   %load = load half, half addrspace(1)* %gep, align 2
 422   %v = tail call half @llvm.sin.f16(half %load) ; checks expect an f32 v_sin followed by a convert to f16
 423   %canonicalized = tail call half @llvm.canonicalize.f16(half %v) ; checks expect this to fold away (no v_mul/v_max after the convert)
 424   store half %canonicalized, half addrspace(1)* %gep, align 2
 425   ret void
 426 }
 427
 428 ; GCN-LABEL: {{^}}test_fold_canonicalize_cos_value_f16:
 429 ; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
 430 ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
 431 ; GCN-NOT: v_mul
 432 ; GCN-NOT: v_max
 433 ; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
 434 define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) {
 435   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 436   %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
 437   %load = load half, half addrspace(1)* %gep, align 2
 438   %v = tail call half @llvm.cos.f16(half %load) ; checks expect an f32 v_cos followed by a convert to f16
 439   %canonicalized = tail call half @llvm.canonicalize.f16(half %v) ; checks expect this to fold away (no v_mul/v_max after the convert)
 440   store half %canonicalized, half addrspace(1)* %gep, align 2
 441   ret void
 442 }
 443
 444 ; GCN-LABEL: {{^}}test_fold_canonicalize_qNaN_value_f32:
 445 ; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
 446 ; GCN-NOT: v_mul
 447 ; GCN-NOT: v_max
 448 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 449 define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) {
 450   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 451   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 452   %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000) ; constant NaN input, checks expect a plain move of 0x7fc00000
 453   store float %canonicalized, float addrspace(1)* %gep, align 4
 454   ret void
 455 }
 456
 457 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode:
 458 ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
 459 ; GCN-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
 460 ; GCN-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
 461 ; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]]
 462
 463 ; GCN-NOT: v_max
 464 ; GCN-NOT: v_mul
 465
 466 ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 467 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
 468   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 469   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 470   %load = load float, float addrspace(1)* %gep, align 4
 471   %v = tail call float @llvm.minnum.f32(float %load, float 0.0) ; checks expect the load to pass through one v_mul/v_max before the v_min
 472   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect no further v_max/v_mul after the v_min
 473   store float %canonicalized, float addrspace(1)* %gep, align 4
 474   ret void
 475 }
 476
 477 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
 478 ; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
 479 ; GCN-DENORM-NOT: v_max
 480 ; GCN-DENORM-NOT: v_mul
 481
 482 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
 483 ; GCN-DENORM-NOT: v_max
 484 ; GCN-DENORM-NOT: v_mul
 485
 486 ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}]
 487 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(float addrspace(1)* %arg) #1 {
 488   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 489   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 490   %load = load float, float addrspace(1)* %gep, align 4
 491   %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
 492   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect no v_max/v_mul around the v_min in the denormal-enabled runs
 493   store float %canonicalized, float addrspace(1)* %gep, align 4
 494   ret void
 495 }
 496
 497 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_f32:
 498 ; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
 499 ; GCN-NOT: v_mul
 500 ; GCN-NOT: v_max
 501 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 502 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) {
 503   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 504   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 505   %load = load float, float addrspace(1)* %gep, align 4
 506   %v0 = fadd float %load, 0.0 ; the minnum below takes an arithmetic result rather than the raw load
 507   %v = tail call float @llvm.minnum.f32(float %v0, float 0.0)
 508   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect this to fold away (no v_mul/v_max after the v_min)
 509   store float %canonicalized, float addrspace(1)* %gep, align 4
 510   ret void
 511 }
 512
 513 ; FIXME: Should there be more checks here? minnum with NaN operand is simplified away.
 514
 515 ; GCN-LABEL: {{^}}test_fold_canonicalize_sNaN_value_f32:
 516 ; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]]
 517 ; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]]
 518 ; GCN-DENORM: v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]]
 519 define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
 520   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 521   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 522   %load = load float, float addrspace(1)* %gep, align 4
 523   %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float)) ; 2139095041 is 0x7f800001, a signaling NaN bit pattern
 524   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks only expect a v_mul/v_max applied to the loaded value
 525   store float %canonicalized, float addrspace(1)* %gep, align 4
 526   ret void
 527 }
 528
 529 ; GCN-LABEL: {{^}}test_fold_canonicalize_denorm_value_f32:
 530 ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
 531
 532 ; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
 533 ; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]]
 534
 535 ; GFX9-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
 536 ; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
 537
 538
 539 ; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]]
 540 ; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]]
 541
 542 ; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]]
 543
 544 ; GCN-NOT: v_mul
 545 ; GCN-NOT: v_max
 546 ; GCN:   {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
 547 define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
 548   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 549   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 550   %load = load float, float addrspace(1)* %gep, align 4
 551   %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float)) ; 8388607 is 0x7fffff, checks expect it as 0x7fffff in denormal runs and as 0 in flush runs
 552   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect no v_mul/v_max between the v_min and the store
 553   store float %canonicalized, float addrspace(1)* %gep, align 4
 554   ret void
 555 }
 556
 557 ; GCN-LABEL: {{^}}test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
 558 ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
 559
 560 ; GFX9:  v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
 561
 562 ; VI-FLUSH:    v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
 563 ; VI-FLUSH:    v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
 564
 565 ; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
 566
 567 ; GCN-NOT: v_mul
 568 ; GCN-NOT: v_max
 569 ; GCN:  {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
 570 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
 571   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 572   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 573   %load = load float, float addrspace(1)* %gep, align 4
 574   %v = tail call float @llvm.maxnum.f32(float %load, float 0.0) ; only the VI flush run expects a v_mul by 1.0 ahead of the v_max
 575   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect no v_mul/v_max between the v_max and the store
 576   store float %canonicalized, float addrspace(1)* %gep, align 4
 577   ret void
 578 }
 579
 580 ; GCN-LABEL: {{^}}test_fold_canonicalize_maxnum_value_f32:
 581 ; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
 582 ; GCN-NOT: v_max
 583 ; GCN-NOT: v_mul
 584 ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 585 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) {
 586   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 587   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
 588   %load = load float, float addrspace(1)* %gep, align 4
 589   %v0 = fadd float %load, 0.0 ; the maxnum below takes an arithmetic result rather than the raw load
 590   %v = tail call float @llvm.maxnum.f32(float %v0, float 0.0)
 591   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect no second v_max and no v_mul before the store
 592   store float %canonicalized, float addrspace(1)* %gep, align 4
 593   ret void
 594 }
 595
 596 ; GCN-LABEL: {{^}}test_fold_canonicalize_maxnum_value_f64:
 597 ; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
 598 ; GCN-NOT: v_mul
 599 ; GCN-NOT: v_max
 600 ; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
 601 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) {
 602   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 603   %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
 604   %load = load double, double addrspace(1)* %gep, align 8
 605   %v0 = fadd double %load, 0.0 ; the maxnum below takes an arithmetic result rather than the raw load
 606   %v = tail call double @llvm.maxnum.f64(double %v0, double 0.0)
 607   %canonicalized = tail call double @llvm.canonicalize.f64(double %v) ; checks expect no second v_max and no v_mul before the store
 608   store double %canonicalized, double addrspace(1)* %gep, align 8
 609   ret void
 610 }
 611
 612 ; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32_no_ieee:
 613 ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
 614 ; GCN-NOT: v_mul
 615 ; GCN-NOT: v_max
 616 ; GCN-NEXT: ; return
 617 define amdgpu_ps float @test_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
 618 entry:
 619   %v = fmul float %arg, 15.0
 620   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; checks expect the return to follow the single v_mul directly
 621   ret float %canonicalized
 622 }
 623
 624 ; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
 625 ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
 626 ; GCN-NOT: v_mul
 627 ; GCN-NOT: v_max
 628 ; GCN-NEXT: ; return
 629 define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) { ; same as the plain fmul amdgpu_ps test, but the fmul carries the nnan flag
 630 entry:
 631   %v = fmul nnan float %arg, 15.0 ; nnan fast-math flag on the producer; 15.0 is the 0x41700000 literal in the expected v_mul_f32
 632   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; the canonicalize under test
 633   ret float %canonicalized
 634 }
 635
 636 ; GCN-LABEL: {{^}}test_fold_canonicalize_fdiv_value_f32_no_ieee:
 637 ; GCN: v_div_fixup_f32
 638 ; GCN-NOT: v_max
 639 ; GCN-NOT: v_mul
 640 ; GCN: ; return
 641 define amdgpu_ps float @test_fold_canonicalize_fdiv_value_f32_no_ieee(float %arg0) { ; amdgpu_ps function: canonicalize of an fdiv result; the checks above expect no v_max/v_mul after v_div_fixup_f32
 642 entry:
 643   %v = fdiv float 15.0, %arg0 ; constant numerator, argument as divisor
 644   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; the canonicalize under test
 645   ret float %canonicalized
 646 }
 647
 648 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32:
 649 ; GFX9-DENORM: global_load_dword [[V:v[0-9]+]],
 650 ; GFX9-DENORM: global_store_dword v[{{[0-9:]+}}], [[V]]
 651 ; GFX9-DENORM-NOT: 1.0
 652 ; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
 653 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 { ; canonicalize of a plain load in a function with attribute group #1 ("no-nans-fp-math"="true")
 654   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 655   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id ; source element, indexed by the workitem id
 656   %v = load float, float addrspace(1)* %gep, align 4
 657   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; operand is the loaded value itself, no FP op in between
 658   %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id ; destination uses the same index in the separate %out buffer
 659   store float %canonicalized, float addrspace(1)* %gep2, align 4
 660   ret void
 661 }
 662
 663 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64:
 664 ; GCN: {{flat|global}}_load_dwordx2 [[V:v\[[0-9:]+\]]],
 665 ; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
 666 ; GCN-NOT: v_mul_
 667 ; GCN-NOT: v_max_
 668 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 { ; double variant: canonicalize of a plain load under attribute group #1 ("no-nans-fp-math"="true")
 669   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 670   %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id ; source element, indexed by the workitem id
 671   %v = load double, double addrspace(1)* %gep, align 8
 672   %canonicalized = tail call double @llvm.canonicalize.f64(double %v) ; operand is the loaded value itself, no FP op in between
 673   %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id ; destination uses the same index in the separate %out buffer
 674   store double %canonicalized, double addrspace(1)* %gep2, align 8
 675   ret void
 676 }
 677
 678 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16:
 679 ; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]],
 680 ; GCN-NOT: v_mul
 681 ; GCN-NOT: v_max
 682 ; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V]]
 683 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 { ; half variant: canonicalize of a plain load under attribute group #1 ("no-nans-fp-math"="true")
 684   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 685   %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id ; source element, indexed by the workitem id
 686   %v = load half, half addrspace(1)* %gep, align 2
 687   %canonicalized = tail call half @llvm.canonicalize.f16(half %v) ; operand is the loaded value itself, no FP op in between
 688   %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id ; destination uses the same index in the separate %out buffer
 689   store half %canonicalized, half addrspace(1)* %gep2, align 2
 690   ret void
 691 }
 692
 693 ; GCN-LABEL: {{^}}test_fold_canonicalize_select_value_f32:
 694 ; GCN: v_add_f32
 695 ; GCN: v_add_f32
 696 ; GCN: v_cndmask_b32
 697 ; GCN-NOT: v_mul_
 698 ; GCN-NOT: v_max_
 699 define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(float addrspace(1)* %arg) { ; canonicalize of a select whose two inputs are both fadd results; the checks above expect no v_mul_/v_max_ after the v_cndmask
 700   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 701   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id ; element of %arg indexed by the workitem id
 702   %load0 = load volatile float, float addrspace(1)* %gep, align 4 ; two volatile loads of the same address feed the two select arms
 703   %load1 = load volatile float, float addrspace(1)* %gep, align 4
 704   %load2 = load volatile i32, i32 addrspace(1)* undef, align 4 ; integer loaded from an undef address; only used for the condition
 705   %v0 = fadd float %load0, 15.0 ; each arm goes through its own fadd (the two v_add_f32 in the checks)
 706   %v1 = fadd float %load1, 32.0
 707   %cond = icmp eq i32 %load2, 0
 708   %select = select i1 %cond, float %v0, float %v1
 709   %canonicalized = tail call float @llvm.canonicalize.f32(float %select) ; the canonicalize under test
 710   store float %canonicalized, float addrspace(1)* %gep, align 4 ; result overwrites the loaded element
 711   ret void
 712 }
 713
 714 ; Need to quiet the nan with a separate instruction since it will be
 715 ; passed through the minnum.
 716 ; FIXME: canonicalize doesn't work correctly without ieee_mode
 717
 718 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode:
 719 ; GFX9-NOT: v0
 720 ; GFX9-NOT: v1
 721 ; GFX9: v_min_f32_e32 v0, v0, v1
 722 ; GFX9-NEXT: ; return to shader
 723
 724 ; VI-FLUSH: v_min_f32_e32 v0, v0, v1
 725 ; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
 726 ; VI-FLUSH-NEXT: ; return
 727
 728 ; VI-DENORM-NOT: v0
 729 ; VI-DENORM: v_min_f32_e32 v0, v0, v1
 730 ; VI-DENORM-NEXT: ; return
 731 define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) { ; amdgpu_ps function: canonicalize of minnum applied directly to the two arguments
 732   %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1) ; operands are the raw function arguments
 733   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; the canonicalize under test
 734   ret float %canonicalized
 735 }
 736
 737 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_ieee_mode:
 738 ; GFX9: v_min_f32_e32 v0, v0, v1
 739 ; GFX9-NEXT: s_setpc_b64
 740
 741 ; VI-FLUSH-DAG: v_mul_f32_e32 v0, 1.0, v0
 742 ; VI-FLUSH-DAG: v_mul_f32_e32 v1, 1.0, v1
 743 ; VI-FLUSH: v_min_f32_e32 v0, v0, v1
 744
 745 ; VI-DENORM-DAG: v_max_f32_e32 v0, v0, v0
 746 ; VI-DENORM-DAG: v_max_f32_e32 v1, v1, v1
 747 ; VI-DENORM: v_min_f32_e32 v0, v0, v1
 748
 749 ; VI-NEXT: s_setpc_b64
 750 define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) { ; same IR body as the amdgpu_ps minnum test, but with no calling convention given; the checks above end in s_setpc_b64
 751   %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1) ; operands are the raw function arguments
 752   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; the canonicalize under test
 753   ret float %canonicalized
 754 }
 755
 756 ; Canonicalizing flush necessary pre-gfx9
 757 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode_nnan:
 758 ; GCN: v_min_f32_e32 v0, v0, v1
 759 ; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
 760 ; GCN-NEXT: ; return
 761 define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode_nnan(float %arg0, float %arg1) #1 { ; amdgpu_ps minnum test with attribute group #1 ("no-nans-fp-math"="true")
 762   %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1) ; operands are the raw function arguments
 763   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; the canonicalize under test
 764   ret float %canonicalized
 765 }
 766
 767 ; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16:
 768 ; GFX9-DAG: v_add_f16_e32
 769 ; GFX9-DAG: v_mul_f16_e32
 770 ; GFX9-NOT: v_max
 771 ; GFX9-NOT: v_pk_max
 772 define <2 x half> @v_test_canonicalize_build_vector_v2f16(<2 x half> %vec) { ; canonicalize of a <2 x half> rebuilt from two scalar FP op results; the checks above expect no v_max/v_pk_max on gfx9
 773   %lo = extractelement <2 x half> %vec, i32 0
 774   %hi = extractelement <2 x half> %vec, i32 1
 775   %lo.op = fadd half %lo, 1.0
 776   %hi.op = fmul half %lo, 4.0 ; NOTE(review): multiplies %lo, so %hi above is never used -- confirm whether %hi was intended; the checks were written against this IR
 777   %ins0 = insertelement <2 x half> undef, half %lo.op, i32 0 ; both lanes of the new vector come from FP op results
 778   %ins1 = insertelement <2 x half> %ins0, half %hi.op, i32 1
 779   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1) ; the canonicalize under test
 780   ret <2 x half> %canonicalized
 781 }
 782
 783 ; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_noncanon1_v2f16:
 784 ; GFX9: v_add_f16_e32
 785 ; GFX9: v_pk_max
 786 define <2 x half> @v_test_canonicalize_build_vector_noncanon1_v2f16(<2 x half> %vec) { ; only element 0 is replaced by an FP op result; the checks above still expect a v_pk_max on gfx9
 787   %lo = extractelement <2 x half> %vec, i32 0
 788   %lo.op = fadd half %lo, 1.0
 789   %ins = insertelement <2 x half> %vec, half %lo.op, i32 0 ; element 1 stays the unmodified argument value
 790   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) ; the canonicalize under test
 791   ret <2 x half> %canonicalized
 792 }
 793
 794 ; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_noncanon0_v2f16:
 795 ; GFX9: v_add_f16_sdwa
 796 ; GFX9: v_pk_max
 797 define <2 x half> @v_test_canonicalize_build_vector_noncanon0_v2f16(<2 x half> %vec) { ; only element 1 is replaced by an FP op result; the checks above still expect a v_pk_max on gfx9
 798   %hi = extractelement <2 x half> %vec, i32 1
 799   %hi.op = fadd half %hi, 1.0
 800   %ins = insertelement <2 x half> %vec, half %hi.op, i32 1 ; element 0 stays the unmodified argument value
 801   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) ; the canonicalize under test
 802   ret <2 x half> %canonicalized
 803 }
 804
 805 ; GCN-LABEL: {{^}}v_test_canonicalize_extract_element_v2f16:
 806 ; GFX9: s_waitcnt
 807 ; GFX9-NEXT: v_mul_f16_e32 v0, 4.0, v0
 808 ; GFX9-NEXT: s_setpc_b64
 809 define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) { ; canonicalize of one element extracted from a vector fmul; the checks above expect a single scalar v_mul_f16 on gfx9
 810   %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0> ; both lanes multiplied by 4.0
 811   %elt = extractelement <2 x half> %vec.op, i32 0 ; only lane 0 is used afterwards
 812   %canonicalized = call half @llvm.canonicalize.f16(half %elt) ; the canonicalize under test
 813   ret half %canonicalized
 814 }
 815
 816 ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16:
 817 ; GFX9: v_mul_f16_e32
 818 ; GFX9: v_pk_mul_f16
 819 ; GFX9-NOT: v_max
 820 ; GFX9-NOT: v_pk_max
 821 define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) { ; insertelement where both the vector and the inserted scalar are FP op results; the checks above expect no v_max/v_pk_max on gfx9
 822   %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
 823   %ins.op = fmul half %val, 8.0
 824   %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx ; insertion index is a runtime argument, not a constant
 825   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) ; the canonicalize under test
 826   ret <2 x half> %canonicalized
 827 }
 828
 829 ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_vec_v2f16:
 830 ; GFX9: v_mul_f16
 831 ; GFX9: v_pk_max_f16 v0, v0, v0
 832 ; GFX9-NEXT: s_setpc_b64
 833 define <2 x half> @v_test_canonicalize_insertelement_noncanon_vec_v2f16(<2 x half> %vec, half %val, i32 %idx) { ; inserted scalar is an fmul result but the base vector is the raw argument; the checks above expect a v_pk_max_f16 on gfx9
 834   %ins.op = fmul half %val, 8.0
 835   %ins = insertelement <2 x half> %vec, half %ins.op, i32 %idx ; %vec is used unmodified; index is a runtime argument
 836   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) ; the canonicalize under test
 837   ret <2 x half> %canonicalized
 838 }
 839
 840 ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_insval_v2f16:
 841 ; GFX9: v_pk_mul_f16
 842 ; GFX9: v_pk_max_f16 v0, v0, v0
 843 ; GFX9-NEXT: s_setpc_b64
 844 define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x half> %vec, half %val, i32 %idx) { ; base vector is an fmul result but the inserted scalar is the raw argument; the checks above expect a v_pk_max_f16 on gfx9
 845   %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
 846   %ins = insertelement <2 x half> %vec.op, half %val, i32 %idx ; %val is used unmodified; index is a runtime argument
 847   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) ; the canonicalize under test
 848   ret <2 x half> %canonicalized
 849 }
 850
 851 ; GCN-LABEL: {{^}}v_test_canonicalize_cvt_pkrtz:
 852 ; GCN: s_waitcnt
 853 ; GCN-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
 854 ; GCN-NEXT: s_setpc_b64
 855 define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) { ; canonicalize applied directly to the cvt.pkrtz intrinsic result; the checks above expect only v_cvt_pkrtz_f16_f32 before the return
 856   %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
 857   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt) ; the canonicalize under test
 858   ret <2 x half> %canonicalized
 859 }
 860
 861 ; GCN-LABEL: {{^}}v_test_canonicalize_cubeid:
 862 ; GCN: s_waitcnt
 863 ; GCN-NEXT: v_cubeid_f32 v0, v0, v1, v2
 864 ; GCN-NEXT: s_setpc_b64
 865 define float @v_test_canonicalize_cubeid(float %a, float %b, float %c) { ; canonicalize applied directly to the cubeid intrinsic result; the checks above expect only v_cubeid_f32 before the return
 866   %cvt = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
 867   %canonicalized = call float @llvm.canonicalize.f32(float %cvt) ; the canonicalize under test
 868   ret float %canonicalized
 869 }
 870
 871 ; GCN-LABEL: {{^}}v_test_canonicalize_frexp_mant:
 872 ; GCN: s_waitcnt
 873 ; GCN-NEXT: v_frexp_mant_f32_e32 v0, v0
 874 ; GCN-NEXT: s_setpc_b64
 875 define float @v_test_canonicalize_frexp_mant(float %a) { ; canonicalize applied directly to the frexp.mant intrinsic result; the checks above expect only v_frexp_mant_f32 before the return
 876   %cvt = call float @llvm.amdgcn.frexp.mant.f32(float %a)
 877   %canonicalized = call float @llvm.canonicalize.f32(float %cvt) ; the canonicalize under test
 878   ret float %canonicalized
 879 }
 880
 881 ; Avoid failing the test on FreeBSD11.0 which will match the GCN-NOT: 1.0
 882 ; in the .amd_amdgpu_isa "amdgcn-unknown-freebsd11.0--gfx802" directive
 883 ; GCN: .amd_amdgpu_isa
 884
 885 declare float @llvm.canonicalize.f32(float) #0
 886 declare float @llvm.copysign.f32(float, float) #0
 887 declare float @llvm.amdgcn.fmul.legacy(float, float) #0
 888 declare float @llvm.amdgcn.fmad.ftz.f32(float, float, float) #0
 889 declare double @llvm.canonicalize.f64(double) #0
 890 declare half @llvm.canonicalize.f16(half) #0
 891 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
 892 declare i32 @llvm.amdgcn.workitem.id.x() #0
 893 declare float @llvm.sqrt.f32(float) #0
 894 declare float @llvm.ceil.f32(float) #0
 895 declare float @llvm.floor.f32(float) #0
 896 declare float @llvm.fma.f32(float, float, float) #0
 897 declare float @llvm.fmuladd.f32(float, float, float) #0
 898 declare float @llvm.fabs.f32(float) #0
 899 declare float @llvm.sin.f32(float) #0
 900 declare float @llvm.cos.f32(float) #0
 901 declare half @llvm.sin.f16(half) #0
 902 declare half @llvm.cos.f16(half) #0
 903 declare float @llvm.minnum.f32(float, float) #0
 904 declare float @llvm.maxnum.f32(float, float) #0
 905 declare double @llvm.maxnum.f64(double, double) #0
 906 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #0
 907 declare float @llvm.amdgcn.cubeid(float, float, float) #0
 908 declare float @llvm.amdgcn.frexp.mant.f32(float) #0
 909
 910 attributes #0 = { nounwind readnone } ; attached to every intrinsic declaration above
 911 attributes #1 = { "no-nans-fp-math"="true" } ; attached to the *_nnan test functions
 912 attributes #2 = { "target-features"="-fp64-fp16-denormals" } ; NOTE(review): not referenced in this part of the file -- presumably used by earlier tests; verify