llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll

   1 ; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH %s
   2 ; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM %s
   3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM %s
   4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH %s
   5
   6 ; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
   7 ; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
   8 ; GFX9: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
   9 define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(ptr addrspace(1) %arg) { ; canonicalize applied directly to a loaded float; the checks above expect an explicit canonicalizing instruction to remain
  10   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  11   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id ; element of %arg selected by the workitem id
  12   %v = load float, ptr addrspace(1) %gep, align 4
  13   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; operand is the raw load, with no FP operation in between
  14   store float %canonicalized, ptr addrspace(1) %gep, align 4 ; result is written back to the same element
  15   ret void
  16 }
  17
  18 ; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32:
  19 ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
  20 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
  21 ; GCN-NOT: 1.0
  22 define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(ptr addrspace(1) %arg) { ; canonicalize of an fmul result; the checks above expect the multiply's register to be stored directly
  23   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  24   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
  25   %load = load float, ptr addrspace(1) %gep, align 4
  26   %v = fmul float %load, 15.0 ; 15.0 is the 0x41700000 literal matched by the checks above
  27   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the fmul
  28   store float %canonicalized, ptr addrspace(1) %gep, align 4
  29   ret void
  30 }
  31
  32 ; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_legacy_value_f32:
  33 ; GCN: v_mul_legacy_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
  34 ; GCN-NOT: v_mul
  35 ; GCN-NOT: v_max
  36 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
  37 define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(ptr addrspace(1) %arg) { ; canonicalize of an llvm.amdgcn.fmul.legacy result; the checks above expect no extra v_mul/v_max before the store
  38   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  39   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
  40   %load = load float, ptr addrspace(1) %gep, align 4
  41   %v = call float @llvm.amdgcn.fmul.legacy(float %load, float 15.0) ; AMDGPU-specific multiply intrinsic with constant 15.0
  42   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the legacy multiply
  43   store float %canonicalized, ptr addrspace(1) %gep, align 4
  44   ret void
  45 }
  46
  47 ; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
  48 ; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
  49 ; GCN-NOT: v_mul
  50 ; GCN-NOT: v_max
  51 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
  52 define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(ptr addrspace(1) %arg) { ; canonicalize of an fsub result; the checks above expect the subtract's register to be stored directly
  53   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  54   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
  55   %load = load float, ptr addrspace(1) %gep, align 4
  56   %v = fsub float 15.0, %load ; constant is the minuend, the loaded value is subtracted from it
  57   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the fsub
  58   store float %canonicalized, ptr addrspace(1) %gep, align 4
  59   ret void
  60 }
  61
  62 ; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
  63 ; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
  64 ; GCN-NOT: v_mul
  65 ; GCN-NOT: v_max
  66 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
  67 define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(ptr addrspace(1) %arg) { ; canonicalize of an fadd result; the checks above expect the add's register to be stored directly
  68   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  69   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
  70   %load = load float, ptr addrspace(1) %gep, align 4
  71   %v = fadd float %load, 15.0 ; 15.0 is the 0x41700000 literal matched by the checks above
  72   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the fadd
  73   store float %canonicalized, ptr addrspace(1) %gep, align 4
  74   ret void
  75 }
  76
  77 ; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
  78 ; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
  79 ; GCN-NOT: v_mul
  80 ; GCN-NOT: v_max
  81 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
  82 define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(ptr addrspace(1) %arg) { ; canonicalize of an llvm.sqrt result; the checks above expect a single v_sqrt_f32 feeding the store
  83   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  84   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
  85   %load = load float, ptr addrspace(1) %gep, align 4
  86   %v = call afn float @llvm.sqrt.f32(float %load) ; the call carries the afn fast-math flag
  87   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the sqrt
  88   store float %canonicalized, ptr addrspace(1) %gep, align 4
  89   ret void
  90 }
  91
  92 ; GCN-LABEL: test_fold_canonicalize_fceil_value_f32:
  93 ; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
  94 ; GCN-NOT: v_mul
  95 ; GCN-NOT: v_max
  96 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
  97 define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(ptr addrspace(1) %arg) { ; canonicalize of an llvm.ceil result; the checks above expect the v_ceil_f32 register to be stored directly
  98   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  99   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 100   %load = load float, ptr addrspace(1) %gep, align 4
 101   %v = call float @llvm.ceil.f32(float %load)
 102   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the ceil
 103   store float %canonicalized, ptr addrspace(1) %gep, align 4
 104   ret void
 105 }
 106
 107 ; GCN-LABEL: test_fold_canonicalize_floor_value_f32:
 108 ; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 109 ; GCN-NOT: v_mul
 110 ; GCN-NOT: v_max
 111 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 112 define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(ptr addrspace(1) %arg) { ; canonicalize of an llvm.floor result; the checks above expect the v_floor_f32 register to be stored directly
 113   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 114   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 115   %load = load float, ptr addrspace(1) %gep, align 4
 116   %v = call float @llvm.floor.f32(float %load)
 117   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the floor
 118   store float %canonicalized, ptr addrspace(1) %gep, align 4
 119   ret void
 120 }
 121
 122 ; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
 123 ; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
 124 ; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
 125 ; GCN-NOT: v_mul
 126 ; GCN-NOT: v_max
 127 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 128 define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(ptr addrspace(1) %arg) { ; canonicalize of an llvm.fma result; the checks above expect the v_fma_f32 register to be stored directly
 129   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 130   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 131   %load = load float, ptr addrspace(1) %gep, align 4
 132   %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0) ; same constant used as multiplier and addend, matched as one shared SGPR in the checks above
 133   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the fma
 134   store float %canonicalized, ptr addrspace(1) %gep, align 4
 135   ret void
 136 }
 137
 138 ; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
 139 ; GCN: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+$}}
 140 ; GCN-NOT: v_mul
 141 ; GCN-NOT: v_max
 142 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 143 define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(ptr addrspace(1) %arg) { ; canonicalize of an llvm.amdgcn.fmad.ftz result; the checks above expect a v_mac_f32 feeding the store
 144   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 145   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 146   %load = load float, ptr addrspace(1) %gep, align 4
 147   %v = call float @llvm.amdgcn.fmad.ftz.f32(float %load, float 15.0, float 15.0) ; AMDGPU-specific multiply-add intrinsic, constant 15.0 for both remaining operands
 148   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the fmad.ftz
 149   store float %canonicalized, ptr addrspace(1) %gep, align 4
 150   ret void
 151 }
 152
 153 ; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
 154 ; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
 155 ; GCN-DENORM: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
 156 ; GCN-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
 157 ; GCN-NOT: v_mul
 158 ; GCN-NOT: v_max
 159 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 160 ; GCN-NOT: 1.0
 161 define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(ptr addrspace(1) %arg) { ; canonicalize of an llvm.fmuladd result; the checks above expect v_mac_f32 in the flush runs and v_fma_f32 in the denormal runs
 162   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 163   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 164   %load = load float, ptr addrspace(1) %gep, align 4
 165   %v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0)
 166   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the fmuladd
 167   store float %canonicalized, ptr addrspace(1) %gep, align 4
 168   ret void
 169 }
 170
 171 ; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32:
 172 ; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]],
 173 ; VI:  v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
 174 ; GFX9: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]]
 175
 176 ; GCN-NOT: v_mul
 177 ; GCN-NOT: v_max
 178 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 179 define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(ptr addrspace(1) %arg) { ; two canonicalize calls chained on a load; the checks above expect exactly one canonicalizing instruction
 180   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 181   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 182   %load = load float, ptr addrspace(1) %gep, align 4
 183   %v = call float @llvm.canonicalize.f32(float %load) ; first canonicalize, applied to the raw load
 184   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; second canonicalize, applied to the first one's result
 185   store float %canonicalized, ptr addrspace(1) %gep, align 4
 186   ret void
 187 }
 188
 189 ; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
 190 ; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
 191 ; GCN-NOT: v_mul
 192 ; GCN-NOT: v_max
 193 ; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[V]]
 194 define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) { ; canonicalize of a float-to-double fpext; the checks above expect the v_cvt_f64_f32 result to be stored directly
 195   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 196   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 197   %load = load float, ptr addrspace(1) %gep, align 4
 198   %v = fpext float %load to double
 199   %canonicalized = tail call double @llvm.canonicalize.f64(double %v) ; f64 canonicalize whose source is the extension
 200   %gep2 = getelementptr inbounds double, ptr addrspace(1) %out, i32 %id ; separate output buffer because the element type changes
 201   store double %canonicalized, ptr addrspace(1) %gep2, align 8
 202   ret void
 203 }
 204
 205 ; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
 206 ; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 207 ; GCN-NOT: v_mul
 208 ; GCN-NOT: v_max
 209 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 210 define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) { ; canonicalize of a half-to-float fpext; the checks above expect the v_cvt_f32_f16 result to be stored directly
 211   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 212   %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
 213   %load = load half, ptr addrspace(1) %gep, align 2
 214   %v = fpext half %load to float
 215   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; f32 canonicalize whose source is the extension
 216   %gep2 = getelementptr inbounds float, ptr addrspace(1) %out, i32 %id ; separate output buffer because the element type changes
 217   store float %canonicalized, ptr addrspace(1) %gep2, align 4
 218   ret void
 219 }
 220
 221 ; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16_flushf16:
 222 ; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 223 ; GCN-NOT: v_mul
 224 ; GCN-NOT: v_max
 225 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 226 define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16_flushf16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 { ; same IR as the f32_f16 fpext test but with attribute group #2; NOTE(review): #2 is defined outside this view, presumably an f16 denormal-flush setting per the test name - confirm
 227   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 228   %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
 229   %load = load half, ptr addrspace(1) %gep, align 2
 230   %v = fpext half %load to float
 231   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; f32 canonicalize whose source is the extension
 232   %gep2 = getelementptr inbounds float, ptr addrspace(1) %out, i32 %id
 233   store float %canonicalized, ptr addrspace(1) %gep2, align 4
 234   ret void
 235 }
 236
 237 ; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
 238 ; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
 239 ; GCN-NOT: v_mul
 240 ; GCN-NOT: v_max
 241 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 242 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(ptr addrspace(1) %arg, ptr addrspace(1) %out) { ; canonicalize of a double-to-float fptrunc; the checks above expect the v_cvt_f32_f64 result to be stored directly
 243   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 244   %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
 245   %load = load double, ptr addrspace(1) %gep, align 8
 246   %v = fptrunc double %load to float
 247   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; f32 canonicalize whose source is the truncation
 248   %gep2 = getelementptr inbounds float, ptr addrspace(1) %out, i32 %id ; separate output buffer because the element type changes
 249   store float %canonicalized, ptr addrspace(1) %gep2, align 4
 250   ret void
 251 }
 252
 253 ; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
 254 ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 255 ; GCN-NOT: v_max
 256 ; GCN-NOT: v_mul
 257 ; GCN: {{flat|global}}_store_short v{{.+}}, [[V]]
 258 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) { ; canonicalize of a float-to-half fptrunc; the checks above expect the v_cvt_f16_f32 result to be stored with a short store
 259   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 260   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 261   %load = load float, ptr addrspace(1) %gep, align 4
 262   %v = fptrunc float %load to half
 263   %canonicalized = tail call half @llvm.canonicalize.f16(half %v) ; f16 canonicalize whose source is the truncation
 264   %gep2 = getelementptr inbounds half, ptr addrspace(1) %out, i32 %id ; separate output buffer because the element type changes
 265   store half %canonicalized, ptr addrspace(1) %gep2, align 2
 266   ret void
 267 }
 268
 269 ; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32_flushf16:
 270 ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 271 ; GCN-NOT: v_max
 272 ; GCN-NOT: v_mul
 273 ; GCN: {{flat|global}}_store_short v{{.+}}, [[V]]
 274 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 { ; same IR as the f16_f32 fptrunc test but with attribute group #2; NOTE(review): #2 is defined outside this view, presumably an f16 denormal-flush setting per the test name - confirm
 275   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 276   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 277   %load = load float, ptr addrspace(1) %gep, align 4
 278   %v = fptrunc float %load to half
 279   %canonicalized = tail call half @llvm.canonicalize.f16(half %v) ; f16 canonicalize whose source is the truncation
 280   %gep2 = getelementptr inbounds half, ptr addrspace(1) %out, i32 %id
 281   store half %canonicalized, ptr addrspace(1) %gep2, align 2
 282   ret void
 283 }
 284
 285 ; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
 286 ; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
 287 ; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
 288 ; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
 289 ; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
 290 ; GFX9: v_pack_b32_f16 [[V:v[0-9]+]], [[V1]], [[V0]]
 291 ; GCN-NOT: v_mul
 292 ; GCN-NOT: v_max
 293 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 294 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) { ; vector form: canonicalize of a <2 x float> to <2 x half> fptrunc; the checks above expect two converts plus a pack/or and no v_mul/v_max
 295   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 296   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
 297   %load = load <2 x float>, ptr addrspace(1) %gep, align 8
 298   %v = fptrunc <2 x float> %load to <2 x half>
 299   %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v) ; v2f16 canonicalize whose source is the truncation
 300   %gep2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i32 %id
 301   store <2 x half> %canonicalized, ptr addrspace(1) %gep2, align 4 ; both halves are stored as one 4-byte-aligned vector
 302   ret void
 303 }
 304
 305 ; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
 306 ; VI:  v_mul_f32_e32 v{{[0-9]+}}, -1.0, v{{[0-9]+}}
 307 ; GFX9: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
 308 define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(ptr addrspace(1) %arg) { ; canonicalize of fneg applied to a raw load; the checks above expect a canonicalizing instruction with the negation as an operand modifier
 309   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 310   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 311   %load = load float, ptr addrspace(1) %gep, align 4
 312   %v = fneg float %load ; negation of the loaded value itself, no arithmetic in between
 313   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
 314   store float %canonicalized, ptr addrspace(1) %gep, align 4
 315   ret void
 316 }
 317
 318 ; GCN-LABEL: test_fold_canonicalize_fneg_value_f32:
 319 ; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
 320 ; GCN-NOT: v_mul
 321 ; GCN-NOT: v_max
 322 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 323 define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(ptr addrspace(1) %arg) { ; canonicalize of fneg applied to an fadd result; the checks above expect only a sign-bit xor and no v_mul/v_max
 324   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 325   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 326   %load = load float, ptr addrspace(1) %gep, align 4
 327   %v0 = fadd float %load, 0.0 ; gives the fneg an arithmetic result as its operand instead of the raw load
 328   %v = fneg float %v0
 329   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the fneg of the fadd
 330   store float %canonicalized, ptr addrspace(1) %gep, align 4
 331   ret void
 332 }
 333
 334 ; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
 335 ; VI:  v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
 336 ; GFX9: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
 337 define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(ptr addrspace(1) %arg) { ; canonicalize of fabs applied to a raw load; the checks above expect a canonicalizing instruction with |x| operand modifiers
 338   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 339   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 340   %load = load float, ptr addrspace(1) %gep, align 4
 341   %v = tail call float @llvm.fabs.f32(float %load) ; absolute value of the loaded value itself, no arithmetic in between
 342   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
 343   store float %canonicalized, ptr addrspace(1) %gep, align 4
 344   ret void
 345 }
 346
 347 ; GCN-LABEL: test_no_fold_canonicalize_fcopysign_value_f32:
 348 ; VI:  v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
 349 ; GFX9: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
 350
 351 ; GCN-NOT: v_mul_
 352 ; GCN-NOT: v_max_
 353 define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(ptr addrspace(1) %arg, float %sign) { ; NOTE(review): despite the name, the value canonicalized and stored below is fabs(%load); the copysign result is never used
 354   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 355   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 356   %load = load float, ptr addrspace(1) %gep, align 4
 357   %canon.load = tail call float @llvm.canonicalize.f32(float %load) ; only consumer is %copysign on the next line
 358   %copysign = call float @llvm.copysign.f32(float %canon.load, float %sign) ; no later instruction in this function reads %copysign
 359   %v = tail call float @llvm.fabs.f32(float %load) ; takes the raw load, not %canon.load or %copysign
 360   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; matches the |x| canonicalize patterns in the checks above
 361   store float %canonicalized, ptr addrspace(1) %gep, align 4
 362   ret void
 363 }
 364
 365 ; GCN-LABEL: test_fold_canonicalize_fabs_value_f32:
 366 ; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
 367 ; GCN-NOT: v_mul
 368 ; GCN-NOT: v_max
 369 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 370 define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(ptr addrspace(1) %arg) { ; canonicalize of fabs applied to an fadd result; the checks above expect only a sign-bit-clearing and and no v_mul/v_max
 371   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 372   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 373   %load = load float, ptr addrspace(1) %gep, align 4
 374   %v0 = fadd float %load, 0.0 ; gives the fabs an arithmetic result as its operand instead of the raw load
 375   %v = tail call float @llvm.fabs.f32(float %v0)
 376   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the fabs of the fadd
 377   store float %canonicalized, ptr addrspace(1) %gep, align 4
 378   ret void
 379 }
 380
 381 ; GCN-LABEL: test_fold_canonicalize_sin_value_f32:
 382 ; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 383 ; GCN-NOT: v_mul
 384 ; GCN-NOT: v_max
 385 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 386 define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(ptr addrspace(1) %arg) { ; canonicalize of an llvm.sin result; the checks above expect the v_sin_f32 register to be stored directly
 387   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 388   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 389   %load = load float, ptr addrspace(1) %gep, align 4
 390   %v = tail call float @llvm.sin.f32(float %load)
 391   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the sin
 392   store float %canonicalized, ptr addrspace(1) %gep, align 4
 393   ret void
 394 }
 395
 396 ; GCN-LABEL: test_fold_canonicalize_cos_value_f32:
 397 ; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
 398 ; GCN-NOT: v_mul
 399 ; GCN-NOT: v_max
 400 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 401 define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(ptr addrspace(1) %arg) { ; canonicalize of an llvm.cos result; the checks above expect the v_cos_f32 register to be stored directly
 402   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 403   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 404   %load = load float, ptr addrspace(1) %gep, align 4
 405   %v = tail call float @llvm.cos.f32(float %load)
 406   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the cos
 407   store float %canonicalized, ptr addrspace(1) %gep, align 4
 408   ret void
 409 }
 410
 411 ; GCN-LABEL: test_fold_canonicalize_sin_value_f16:
 412 ; GCN: v_sin_f16_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
 413 ; GCN-NOT: v_mul
 414 ; GCN-NOT: v_max
 415 ; GCN: {{flat|global}}_store_short v{{.+}}, [[V0]]
 416 define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(ptr addrspace(1) %arg) { ; half-precision variant: canonicalize of an llvm.sin.f16 result; the checks above expect the v_sin_f16 register to be stored directly
 417   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 418   %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
 419   %load = load half, ptr addrspace(1) %gep, align 2
 420   %v = tail call half @llvm.sin.f16(half %load)
 421   %canonicalized = tail call half @llvm.canonicalize.f16(half %v) ; source of the canonicalize is the f16 sin
 422   store half %canonicalized, ptr addrspace(1) %gep, align 2
 423   ret void
 424 }
 425
 426 ; GCN-LABEL: test_fold_canonicalize_cos_value_f16:
 427 ; GCN: v_cos_f16_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
 428 ; GCN-NOT: v_mul
 429 ; GCN-NOT: v_max
 430 ; GCN: {{flat|global}}_store_short v{{.+}}, [[V0]]
 431 define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(ptr addrspace(1) %arg) { ; half-precision variant: canonicalize of an llvm.cos.f16 result; the checks above expect the v_cos_f16 register to be stored directly
 432   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 433   %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
 434   %load = load half, ptr addrspace(1) %gep, align 2
 435   %v = tail call half @llvm.cos.f16(half %load)
 436   %canonicalized = tail call half @llvm.canonicalize.f16(half %v) ; source of the canonicalize is the f16 cos
 437   store half %canonicalized, ptr addrspace(1) %gep, align 2
 438   ret void
 439 }
 440
 441 ; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32:
 442 ; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
 443 ; GCN-NOT: v_mul
 444 ; GCN-NOT: v_max
 445 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 446 define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(ptr addrspace(1) %arg) { ; canonicalize of a constant quiet NaN; the checks above expect a plain v_mov of 0x7fc00000 and no v_mul/v_max
 447   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 448   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id ; used only as the store destination, nothing is loaded in this test
 449   %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000) ; float constant written in LLVM's 64-bit hex form; appears as 0x7fc00000 in the checks above
 450   store float %canonicalized, ptr addrspace(1) %gep, align 4
 451   ret void
 452 }
 453
 454 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode:
 455 ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
 456 ; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
 457 ; GFX9: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
 458
 459 ; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]]
 460 ; GCN-NOT: v_max
 461 ; GCN-NOT: v_mul
 462
 463 ; GFX9: {{flat|global}}_store_dword v{{.+}}, [[V]]
 464 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(ptr addrspace(1) %arg) { ; canonicalize of minnum(load, 0.0); the checks above expect the load to be quieted before v_min_f32 and nothing added after it
 465   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 466   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 467   %load = load float, ptr addrspace(1) %gep, align 4
 468   %v = tail call float @llvm.minnum.f32(float %load, float 0.0) ; one operand is the raw load, the other the constant 0.0
 469   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the minnum
 470   store float %canonicalized, ptr addrspace(1) %gep, align 4
 471   ret void
 472 }
 473
 474 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
 475 ; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
 476 ; GCN-DENORM-NOT: v_max
 477 ; GCN-DENORM-NOT: v_mul
 478
 479 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
 480 ; GCN-DENORM-NOT: v_max
 481 ; GCN-DENORM-NOT: v_mul
 482
 483 ; GFX9: {{flat|global}}_store_dword
 484 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 { ; same IR as the non-nnan minnum-from-load test but with attribute group #1; NOTE(review): #1 is defined outside this view, presumably a no-NaNs setting per the test name - confirm
 485   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 486   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 487   %load = load float, ptr addrspace(1) %gep, align 4
 488   %v = tail call float @llvm.minnum.f32(float %load, float 0.0) ; one operand is the raw load, the other the constant 0.0
 489   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the minnum
 490   store float %canonicalized, ptr addrspace(1) %gep, align 4
 491   ret void
 492 }
 493
 494 ; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
 495 ; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
 496 ; GCN-NOT: v_mul
 497 ; GCN-NOT: v_max
 498 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 499 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(ptr addrspace(1) %arg) { ; canonicalize of minnum whose variable operand is an fadd result; the checks above expect v_min_f32 with no v_mul/v_max before the store
 500   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 501   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 502   %load = load float, ptr addrspace(1) %gep, align 4
 503   %v0 = fadd float %load, 0.0 ; gives the minnum an arithmetic result as its operand instead of the raw load
 504   %v = tail call float @llvm.minnum.f32(float %v0, float 0.0)
 505   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the minnum
 506   store float %canonicalized, ptr addrspace(1) %gep, align 4
 507   ret void
 508 }
 509
 510 ; FIXME: Should there be more checks here? minnum with NaN operand is simplified away.
 511
 512 ; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
 513 ; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]]
 514 ; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]]
 515 ; GFX9: v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]]
 516 define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1) %arg) { ; canonicalize of minnum(load, signaling-NaN constant); the checks above only expect a canonicalize of the loaded value
 517   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 518   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 519   %load = load float, ptr addrspace(1) %gep, align 4
 520   %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float)) ; 2139095041 is 0x7F800001, a signaling NaN bit pattern
 521   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
 522   store float %canonicalized, ptr addrspace(1) %gep, align 4
 523   ret void
 524 }
 525
 526 ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
 527 ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
 528
 529 ; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
 530 ; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]]
 531
 532 ; GFX9-FLUSH: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
 533 ; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
 534
 535 ; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]]
 536 ; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]]
 537
 538 ; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]]
 539
 540 ; GCN-NOT: v_mul
 541 ; GCN-NOT: v_max
 542 ; GCN:   {{flat|global}}_store_dword v{{.+}}, [[RESULT]]
 543 define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) { ; canonicalize of minnum(load, denormal constant); the checks above expect the constant as 0x7fffff in the denormal runs and as 0 in the flush runs
 544   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 545   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 546   %load = load float, ptr addrspace(1) %gep, align 4
 547   %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float)) ; 8388607 is 0x7FFFFF, a denormal bit pattern with every mantissa bit set
 548   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the minnum
 549   store float %canonicalized, ptr addrspace(1) %gep, align 4
 550   ret void
 551 }
 552
 553 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
 554 ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
 555
 556 ; GFX9:  v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
 557
 558 ; VI-FLUSH:    v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
 559 ; VI-FLUSH:    v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
 560
 561 ; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
 562
 563 ; GCN-NOT: v_mul
 564 ; GCN-NOT: v_max
 565 ; GCN:  {{flat|global}}_store_dword v{{.+}}, [[RESULT]]
 566 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode(ptr addrspace(1) %arg) { ; canonicalize of maxnum(load, 0.0); the checks above expect a single v_max_f32 against 0, preceded by a quieting multiply only in the VI flush run
 567   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 568   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 569   %load = load float, ptr addrspace(1) %gep, align 4
 570   %v = tail call float @llvm.maxnum.f32(float %load, float 0.0) ; one operand is the raw load, the other the constant 0.0
 571   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the maxnum
 572   store float %canonicalized, ptr addrspace(1) %gep, align 4
 573   ret void
 574 }
 575
 576 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32:
 577 ; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
 578 ; GCN-NOT: v_max
 579 ; GCN-NOT: v_mul
 580 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 581 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(ptr addrspace(1) %arg) { ; canonicalize of maxnum whose variable operand is an fadd result; the checks above expect one v_max_f32 and no further v_max/v_mul
 582   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 583   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 584   %load = load float, ptr addrspace(1) %gep, align 4
 585   %v0 = fadd float %load, 0.0 ; gives the maxnum an arithmetic result as its operand instead of the raw load
 586   %v = tail call float @llvm.maxnum.f32(float %v0, float 0.0)
 587   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the maxnum
 588   store float %canonicalized, ptr addrspace(1) %gep, align 4
 589   ret void
 590 }
 591
 592 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64:
 593 ; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
 594 ; GCN-NOT: v_mul
 595 ; GCN-NOT: v_max
 596 ; GCN:  {{flat|global}}_store_dwordx2 v{{.+}}, [[V]]
 597 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(ptr addrspace(1) %arg) { ; double-precision variant of the maxnum fold test; the checks above expect one v_max_f64 feeding a dwordx2 store
 598   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 599   %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
 600   %load = load double, ptr addrspace(1) %gep, align 8
 601   %v0 = fadd double %load, 0.0 ; gives the maxnum an arithmetic result as its operand instead of the raw load
 602   %v = tail call double @llvm.maxnum.f64(double %v0, double 0.0)
 603   %canonicalized = tail call double @llvm.canonicalize.f64(double %v) ; source of the canonicalize is the f64 maxnum
 604   store double %canonicalized, ptr addrspace(1) %gep, align 8
 605   ret void
 606 }
 607
 608 ; GCN-LABEL: test_fold_canonicalize_fmul_value_f32_no_ieee:
 609 ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
 610 ; GCN-NOT: v_mul
 611 ; GCN-NOT: v_max
 612 ; GCN-NEXT: ; return
 613 define amdgpu_ps float @test_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) { ; amdgpu_ps variant of the fmul fold test: the argument is passed in directly and the result is returned; NOTE(review): "no_ieee" presumably refers to this calling convention's FP mode - confirm
 614 entry:
 615   %v = fmul float %arg, 15.0 ; 15.0 is the 0x41700000 literal matched by the checks above
 616   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; the checks above expect nothing between the multiply and the return
 617   ret float %canonicalized
 618 }
 619
 620 ; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
 621 ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
 622 ; GCN-NOT: v_mul
 623 ; GCN-NOT: v_max
 624 ; GCN-NEXT: ; return
 625 define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) { ; same as the amdgpu_ps fmul fold test, except that the multiply carries the nnan flag
 626 entry:
 627   %v = fmul nnan float %arg, 15.0 ; nnan fast-math flag is the only difference from the preceding test's IR
 628   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; the checks above expect nothing between the multiply and the return
 629   ret float %canonicalized
 630 }
 631
 632 ; GCN-LABEL: {{^}}test_fold_canonicalize_fdiv_value_f32_no_ieee:
 633 ; GCN: v_div_fixup_f32
 634 ; GCN-NOT: v_max
 635 ; GCN-NOT: v_mul
 636 ; GCN: ; return
 637 define amdgpu_ps float @test_fold_canonicalize_fdiv_value_f32_no_ieee(float %arg0) { ; amdgpu_ps test: canonicalize of an fdiv result; the checks above expect no v_max/v_mul after v_div_fixup_f32
 638 entry:
 639   %v = fdiv float 15.0, %arg0 ; constant is the dividend, the argument the divisor
 640   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; source of the canonicalize is the fdiv
 641   ret float %canonicalized
 642 }
 643
 644 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32:
 645 ; GFX9-DENORM: global_load_dword [[V:v[0-9]+]],
 646 ; GFX9-DENORM: global_store_dword v{{[0-9]+}}, [[V]], s{{\[[0-9]+:[0-9]+\]}}
 647 ; GFX9-DENORM-NOT: 1.0
 648 ; GFX9-DENORM-NOT: v_max
 649 ; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
 650 ; GFX9-FLUSH: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 651 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 { ; attribute group #1 sets no-nans-fp-math
 652   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 653   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 654   %v = load float, ptr addrspace(1) %gep, align 4 ; canonicalize input is a raw loaded value
 655   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; folded only in the GFX9 denormal run, the flush runs still expect a v_mul by 1.0 or a v_max
 656   %gep2 = getelementptr inbounds float, ptr addrspace(1) %out, i32 %id
 657   store float %canonicalized, ptr addrspace(1) %gep2, align 4 ; GFX9 denormal run expects the loaded register to be stored directly
 658   ret void
 659 }
 660
 661 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64:
 662 ; GCN: {{flat|global}}_load_dwordx2 [[V:v\[[0-9:]+\]]],
 663 ; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[V]]
 664 ; GCN-NOT: v_mul_
 665 ; GCN-NOT: v_max_
 666 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 { ; attribute group #1 sets no-nans-fp-math
 667   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 668   %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
 669   %v = load double, ptr addrspace(1) %gep, align 8 ; canonicalize input is a raw loaded value
 670   %canonicalized = tail call double @llvm.canonicalize.f64(double %v) ; expected to fold away in every run configuration
 671   %gep2 = getelementptr inbounds double, ptr addrspace(1) %out, i32 %id
 672   store double %canonicalized, ptr addrspace(1) %gep2, align 8 ; checks expect the loaded register pair to be stored unchanged
 673   ret void
 674 }
 675
 676 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16:
 677 ; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]],
 678 ; GCN-NOT: v_mul
 679 ; GCN-NOT: v_max
 680 ; GCN: {{flat|global}}_store_short v{{.+}}, [[V]]
 681 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 { ; attribute group #1 sets no-nans-fp-math
 682   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 683   %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
 684   %v = load half, ptr addrspace(1) %gep, align 2 ; canonicalize input is a raw loaded value
 685   %canonicalized = tail call half @llvm.canonicalize.f16(half %v) ; expected to fold away, no v_mul or v_max between load and store
 686   %gep2 = getelementptr inbounds half, ptr addrspace(1) %out, i32 %id
 687   store half %canonicalized, ptr addrspace(1) %gep2, align 2 ; checks expect the loaded register to be stored unchanged
 688   ret void
 689 }
 690
 691 ; GCN-LABEL: {{^}}test_fold_canonicalize_select_value_f32:
 692 ; GCN: v_add_f32
 693 ; GCN: v_add_f32
 694 ; GCN: v_cndmask_b32
 695 ; GCN-NOT: v_mul_
 696 ; GCN-NOT: v_max_
 697 define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(ptr addrspace(1) %arg) { ; canonicalize of a select whose two inputs are both fadd results
 698   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
 699   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
 700   %load0 = load volatile float, ptr addrspace(1) %gep, align 4 ; volatile so the two loads of the same address stay separate
 701   %load1 = load volatile float, ptr addrspace(1) %gep, align 4
 702   %load2 = load volatile i32, ptr addrspace(1) undef, align 4 ; only feeds the select condition
 703   %v0 = fadd float %load0, 15.0
 704   %v1 = fadd float %load1, 32.0
 705   %cond = icmp eq i32 %load2, 0
 706   %select = select i1 %cond, float %v0, float %v1
 707   %canonicalized = tail call float @llvm.canonicalize.f32(float %select) ; checks expect no v_mul_ or v_max_ after the v_cndmask_b32
 708   store float %canonicalized, ptr addrspace(1) %gep, align 4
 709   ret void
 710 }
 711
 712 ; Need to quiet the nan with a separate instruction since it will be
 713 ; passed through the minnum.
 714 ; FIXME: canonicalize doesn't work correctly without ieee_mode
 715
 716 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode:
 717 ; GFX9-NOT: v0
 718 ; GFX9-NOT: v1
 719 ; GFX9: v_min_f32_e32 v0, v0, v1
 720 ; GFX9-NEXT: ; return to shader
 721
 722 ; VI-FLUSH: v_min_f32_e32 v0, v0, v1
 723 ; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
 724 ; VI-FLUSH-NEXT: ; return
 725
 726 ; VI-DENORM-NOT: v0
 727 ; VI-DENORM: v_min_f32_e32 v0, v0, v1
 728 ; VI-DENORM-NEXT: ; return
 729 define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) { ; amdgpu_ps calling convention, the no-IEEE-mode case per the test name
 730   %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1) ; both inputs are raw function arguments
 731   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; only the VI flush run expects a trailing v_mul by 1.0, the other runs expect just the v_min
 732   ret float %canonicalized
 733 }
 734
 735 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_ieee_mode:
 736 ; GFX9: v_min_f32_e32 v0, v0, v1
 737 ; GFX9-NEXT: s_setpc_b64
 738
 739 ; VI-DAG: v_mul_f32_e32 v0, 1.0, v0
 740 ; VI-DAG: v_mul_f32_e32 v1, 1.0, v1
 741 ; VI: v_min_f32_e32 v0, v0, v1
 742
 743 ; VI-NEXT: s_setpc_b64
 744 define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) { ; default calling convention, the IEEE-mode case per the test name
 745   %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1) ; the VI checks expect both inputs multiplied by 1.0 before the v_min
 746   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; no instruction expected between the v_min and the return
 747   ret float %canonicalized
 748 }
 749
 750 ; Canonicalizing flush necessary pre-gfx9
 751 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode_nnan:
 752 ; GCN: v_min_f32_e32 v0, v0, v1
 753 ; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
 754 ; GCN-NEXT: ; return
 755 define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode_nnan(float %arg0, float %arg1) #1 { ; amdgpu_ps plus attribute group #1 (no-nans-fp-math)
 756   %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
 757   %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ; only the VI flush run expects a v_mul by 1.0 after the v_min
 758   ret float %canonicalized
 759 }
 760
 761 ; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16:
 762 ; GFX9-DAG: v_add_f16_e32
 763 ; GFX9-DAG: v_mul_f16_e32
 764 ; GFX9-NOT: v_max
 765 ; GFX9-NOT: v_pk_max
 766 define <2 x half> @v_test_canonicalize_build_vector_v2f16(<2 x half> %vec) { ; canonicalize of a vector built from two FP-op results
 767   %lo = extractelement <2 x half> %vec, i32 0
 768   %hi = extractelement <2 x half> %vec, i32 1 ; NOTE(review) %hi is never used below
 769   %lo.op = fadd half %lo, 1.0
 770   %hi.op = fmul half %lo, 4.0 ; NOTE(review) multiplies %lo rather than %hi, confirm this is intentional
 771   %ins0 = insertelement <2 x half> undef, half %lo.op, i32 0
 772   %ins1 = insertelement <2 x half> %ins0, half %hi.op, i32 1
 773   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1) ; GFX9 checks expect no v_max or v_pk_max after the add and mul
 774   ret <2 x half> %canonicalized
 775 }
 776
 777 ; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_noncanon1_v2f16:
 778 ; GFX9: v_add_f16_e32
 779 ; GFX9: v_pk_max
 780 define <2 x half> @v_test_canonicalize_build_vector_noncanon1_v2f16(<2 x half> %vec) { ; element 1 of the result is still the raw argument element
 781   %lo = extractelement <2 x half> %vec, i32 0
 782   %lo.op = fadd half %lo, 1.0 ; only element 0 comes from an FP op
 783   %ins = insertelement <2 x half> %vec, half %lo.op, i32 0
 784   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) ; GFX9 checks expect the v_pk_max to remain
 785   ret <2 x half> %canonicalized
 786 }
 787
 788 ; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_noncanon0_v2f16:
 789 ; GFX9: v_add_f16_sdwa
 790 ; GFX9: v_pk_max
 791 define <2 x half> @v_test_canonicalize_build_vector_noncanon0_v2f16(<2 x half> %vec) { ; element 0 of the result is still the raw argument element
 792   %hi = extractelement <2 x half> %vec, i32 1
 793   %hi.op = fadd half %hi, 1.0 ; only element 1 comes from an FP op, expected as v_add_f16_sdwa
 794   %ins = insertelement <2 x half> %vec, half %hi.op, i32 1
 795   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) ; GFX9 checks expect the v_pk_max to remain
 796   ret <2 x half> %canonicalized
 797 }
 798
 799 ; GCN-LABEL: {{^}}v_test_canonicalize_extract_element_v2f16:
 800 ; GFX9: s_waitcnt
 801 ; GFX9-NEXT: v_mul_f16_e32 v0, 4.0, v0
 802 ; GFX9-NEXT: s_setpc_b64
 803 define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) { ; canonicalize of one element extracted from a vector fmul
 804   %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
 805   %elt = extractelement <2 x half> %vec.op, i32 0 ; only element 0 is used
 806   %canonicalized = call half @llvm.canonicalize.f16(half %elt) ; GFX9 checks expect a single scalar v_mul_f16 and nothing else before the return
 807   ret half %canonicalized
 808 }
 809
 810 ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16:
 811 ; GFX9: v_mul_f16_e32
 812 ; GFX9: v_pk_mul_f16
 813 ; GFX9-NOT: v_max
 814 ; GFX9-NOT: v_pk_max
 815 define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) { ; both the vector and the inserted value come from fmul
 816   %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
 817   %ins.op = fmul half %val, 8.0
 818   %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx ; insertion index is a runtime value
 819   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) ; GFX9 checks expect no v_max or v_pk_max
 820   ret <2 x half> %canonicalized
 821 }
 822
 823 ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_vec_v2f16:
 824 ; GFX9: v_mul_f16
 825 ; GFX9: v_pk_max_f16 v0, v0, v0
 826 ; GFX9-NEXT: s_setpc_b64
 827 define <2 x half> @v_test_canonicalize_insertelement_noncanon_vec_v2f16(<2 x half> %vec, half %val, i32 %idx) { ; the base vector is the raw argument
 828   %ins.op = fmul half %val, 8.0 ; only the inserted value comes from an FP op
 829   %ins = insertelement <2 x half> %vec, half %ins.op, i32 %idx
 830   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) ; GFX9 checks expect v_pk_max_f16 v0, v0, v0 to remain
 831   ret <2 x half> %canonicalized
 832 }
 833
 834 ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_insval_v2f16:
 835 ; GFX9: v_pk_mul_f16
 836 ; GFX9: v_pk_max_f16 v0, v0, v0
 837 ; GFX9-NEXT: s_setpc_b64
 838 define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x half> %vec, half %val, i32 %idx) { ; the inserted value is the raw argument
 839   %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0> ; only the base vector comes from an FP op
 840   %ins = insertelement <2 x half> %vec.op, half %val, i32 %idx
 841   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) ; GFX9 checks expect v_pk_max_f16 v0, v0, v0 to remain
 842   ret <2 x half> %canonicalized
 843 }
 844
 845 ; GCN-LABEL: {{^}}v_test_canonicalize_cvt_pkrtz:
 846 ; GCN: s_waitcnt
 847 ; GCN-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
 848 ; GCN-NEXT: s_setpc_b64
 849 define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) { ; canonicalize of the llvm.amdgcn.cvt.pkrtz result
 850   %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
 851   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt) ; checks expect nothing between the conversion and the return
 852   ret <2 x half> %canonicalized
 853 }
 854
 855 ; GCN-LABEL: {{^}}v_test_canonicalize_cubeid:
 856 ; GCN: s_waitcnt
 857 ; GCN-NEXT: v_cubeid_f32 v0, v0, v1, v2
 858 ; GCN-NEXT: s_setpc_b64
 859 define float @v_test_canonicalize_cubeid(float %a, float %b, float %c) { ; canonicalize of the llvm.amdgcn.cubeid result
 860   %cvt = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
 861   %canonicalized = call float @llvm.canonicalize.f32(float %cvt) ; checks expect nothing between v_cubeid_f32 and the return
 862   ret float %canonicalized
 863 }
 864
 865 ; GCN-LABEL: {{^}}v_test_canonicalize_frexp_mant:
 866 ; GCN: s_waitcnt
 867 ; GCN-NEXT: v_frexp_mant_f32_e32 v0, v0
 868 ; GCN-NEXT: s_setpc_b64
 869 define float @v_test_canonicalize_frexp_mant(float %a) { ; canonicalize of the llvm.amdgcn.frexp.mant result
 870   %cvt = call float @llvm.amdgcn.frexp.mant.f32(float %a)
 871   %canonicalized = call float @llvm.canonicalize.f32(float %cvt) ; checks expect nothing between v_frexp_mant_f32 and the return
 872   ret float %canonicalized
 873 }
 874
 875 ; GCN-LABEL: {{^}}v_test_canonicalize_amdgcn_log:
 876 ; GCN: s_waitcnt
 877 ; GCN-NEXT: v_log_f32
 878 ; GCN-NEXT: s_setpc_b64
 879 define float @v_test_canonicalize_amdgcn_log(float %a) { ; canonicalize of the llvm.amdgcn.log result
 880   %log = call float @llvm.amdgcn.log.f32(float %a)
 881   %canonicalized = call float @llvm.canonicalize.f32(float %log) ; checks expect nothing between v_log_f32 and the return
 882   ret float %canonicalized
 883 }
 884
 885 ; GCN-LABEL: {{^}}v_test_canonicalize_amdgcn_exp2:
 886 ; GCN: s_waitcnt
 887 ; GCN-NEXT: v_exp_f32
 888 ; GCN-NEXT: s_setpc_b64
 889 define float @v_test_canonicalize_amdgcn_exp2(float %a) { ; canonicalize of the llvm.amdgcn.exp2 result
 890   %exp = call float @llvm.amdgcn.exp2.f32(float %a)
 891   %canonicalized = call float @llvm.canonicalize.f32(float %exp) ; checks expect nothing between v_exp_f32 and the return
 892   ret float %canonicalized
 893 }
 894
 895 ; Avoid failing the test on FreeBSD11.0 which will match the GCN-NOT: 1.0
 896 ; in the .amd_amdgpu_isa "amdgcn-unknown-freebsd11.0--gfx802" directive
 897 ; GCN: .amd_amdgpu_isa
 898
 899 declare float @llvm.canonicalize.f32(float) #0
 900 declare float @llvm.copysign.f32(float, float) #0
 901 declare float @llvm.amdgcn.fmul.legacy(float, float) #0
 902 declare float @llvm.amdgcn.fmad.ftz.f32(float, float, float) #0
 903 declare double @llvm.canonicalize.f64(double) #0
 904 declare half @llvm.canonicalize.f16(half) #0
 905 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
 906 declare i32 @llvm.amdgcn.workitem.id.x() #0
 907 declare float @llvm.sqrt.f32(float) #0
 908 declare float @llvm.ceil.f32(float) #0
 909 declare float @llvm.floor.f32(float) #0
 910 declare float @llvm.fma.f32(float, float, float) #0
 911 declare float @llvm.fmuladd.f32(float, float, float) #0
 912 declare float @llvm.fabs.f32(float) #0
 913 declare float @llvm.sin.f32(float) #0
 914 declare float @llvm.cos.f32(float) #0
 915 declare half @llvm.sin.f16(half) #0
 916 declare half @llvm.cos.f16(half) #0
 917 declare float @llvm.minnum.f32(float, float) #0
 918 declare float @llvm.maxnum.f32(float, float) #0
 919 declare double @llvm.maxnum.f64(double, double) #0
 920 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #0
 921 declare float @llvm.amdgcn.cubeid(float, float, float) #0
 922 declare float @llvm.amdgcn.frexp.mant.f32(float) #0
 923 declare float @llvm.amdgcn.log.f32(float) #0
 924 declare float @llvm.amdgcn.exp2.f32(float) #0
 925
 926 attributes #0 = { nounwind readnone } ; attached to the intrinsic declarations listed above
 927 attributes #1 = { "no-nans-fp-math"="true" } ; used by the nnan test functions
 928 attributes #2 = { "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" } ; NOTE(review) no user of #2 is visible in this part of the file, confirm it is still referenced