test/CodeGen/AMDGPU/fneg-combines.ll

   1 ; RUN: llc -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s
   2 ; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s
   3
   4 ; --------------------------------------------------------------------------------
   5 ; fadd tests
   6 ; --------------------------------------------------------------------------------
   7
   ; Negates (written as fsub -0.0, x) an fadd whose only user is the negation.
   ; The default run expects the add followed by an xor with the sign bit
   ; (0x80000000); the -enable-no-signed-zeros-fp-math run expects a single
   ; v_sub_f32 with a negated first source whose result is stored directly.
   8 ; GCN-LABEL: {{^}}v_fneg_add_f32:
   9 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
  10 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
  11
  12 ; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
  13 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
  14
  15 ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
  16 ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
  17 define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  18   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  19   %tid.ext = sext i32 %tid to i64
  20   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  21   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  22   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  23   %a = load volatile float, float addrspace(1)* %a.gep
  24   %b = load volatile float, float addrspace(1)* %b.gep
  25   %add = fadd float %a, %b
  26   %fneg = fsub float -0.000000e+00, %add
  27   store float %fneg, float addrspace(1)* %out.gep
  28   ret void
  29 }
  30
  ; The fadd result is stored in addition to being negated, so both runs
  ; expect the v_add_f32 to remain and the negation to be a separate
  ; sign-bit xor, followed by the two stores back to back.
  ; Note: %out.gep is computed but never used; both volatile stores go to %out.
  31 ; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
  32 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
  33 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
  34 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
  35 ; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
  36 ; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
  37 ; GCN-NEXT: buffer_store_dword [[ADD]]
  38 define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  39   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  40   %tid.ext = sext i32 %tid to i64
  41   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  42   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  43   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  44   %a = load volatile float, float addrspace(1)* %a.gep
  45   %b = load volatile float, float addrspace(1)* %b.gep
  46   %add = fadd float %a, %b
  47   %fneg = fsub float -0.000000e+00, %add
  48   store volatile float %fneg, float addrspace(1)* %out
  49   store volatile float %add, float addrspace(1)* %out
  50   ret void
  51 }
  52
  ; The fadd feeds both the negation and an fmul by 4.0.
  ; Default run: add, sign-bit xor, and a multiply of the un-negated add by 4.0.
  ; No-signed-zeros run: the negated sum is produced by one v_sub_f32 and the
  ; multiply is expected to consume that negated value with the constant
  ; flipped to -4.0.
  ; Note: %out.gep is computed but never used; both volatile stores go to %out.
  53 ; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
  54 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
  55 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
  56
  57 ; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
  58 ; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
  59 ; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
  60
  61 ; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
  62 ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]
  63 ; GCN: buffer_store_dword [[NEG_ADD]]
  64 ; GCN-NEXT: buffer_store_dword [[MUL]]
  65 define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  66   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  67   %tid.ext = sext i32 %tid to i64
  68   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  69   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  70   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  71   %a = load volatile float, float addrspace(1)* %a.gep
  72   %b = load volatile float, float addrspace(1)* %b.gep
  73   %add = fadd float %a, %b
  74   %fneg = fsub float -0.000000e+00, %add
  75   %use1 = fmul float %add, 4.0
  76   store volatile float %fneg, float addrspace(1)* %out
  77   store volatile float %use1, float addrspace(1)* %out
  78   ret void
  79 }
  80
  ; Negates fadd(-a, b). The default run expects a v_sub_f32 followed by a
  ; sign-bit xor; the no-signed-zeros run expects only v_sub_f32 A, B feeding
  ; the store.
  ; Note: %out.gep is computed but never used; the volatile store goes to %out.
  81 ; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
  82 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
  83 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
  84
  85 ; GCN-SAFE: v_sub_f32_e32
  86 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000,
  87
  88 ; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
  89 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
  90 define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  91   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  92   %tid.ext = sext i32 %tid to i64
  93   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  94   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  95   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  96   %a = load volatile float, float addrspace(1)* %a.gep
  97   %b = load volatile float, float addrspace(1)* %b.gep
  98   %fneg.a = fsub float -0.000000e+00, %a
  99   %add = fadd float %fneg.a, %b
 100   %fneg = fsub float -0.000000e+00, %add
 101   store volatile float %fneg, float addrspace(1)* %out
 102   ret void
 103 }
 104
 ; Negates fadd(a, -b). The default run expects v_sub_f32 A, B followed by a
 ; sign-bit xor; the no-signed-zeros run expects only the operand-swapped
 ; v_sub_f32 B, A feeding the store.
 ; Note: %out.gep is computed but never used; the volatile store goes to %out.
 105 ; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
 106 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 107 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 108
 109 ; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 110 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
 111
 112 ; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 113 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
 114 define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 115   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 116   %tid.ext = sext i32 %tid to i64
 117   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 118   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 119   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 120   %a = load volatile float, float addrspace(1)* %a.gep
 121   %b = load volatile float, float addrspace(1)* %b.gep
 122   %fneg.b = fsub float -0.000000e+00, %b
 123   %add = fadd float %a, %fneg.b
 124   %fneg = fsub float -0.000000e+00, %add
 125   store volatile float %fneg, float addrspace(1)* %out
 126   ret void
 127 }
 128
 ; Negates fadd(-a, -b). The default run expects v_sub_f32 with a negated
 ; first source followed by a sign-bit xor; the no-signed-zeros run expects
 ; all three negations to cancel into a plain v_add_f32 A, B.
 ; Note: %out.gep is computed but never used; the volatile store goes to %out.
 129 ; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
 130 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 131 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 132
 133 ; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
 134 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
 135
 136 ; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 137 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
 138 define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 139   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 140   %tid.ext = sext i32 %tid to i64
 141   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 142   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 143   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 144   %a = load volatile float, float addrspace(1)* %a.gep
 145   %b = load volatile float, float addrspace(1)* %b.gep
 146   %fneg.a = fsub float -0.000000e+00, %a
 147   %fneg.b = fsub float -0.000000e+00, %b
 148   %add = fadd float %fneg.a, %fneg.b
 149   %fneg = fsub float -0.000000e+00, %add
 150   store volatile float %fneg, float addrspace(1)* %out
 151   ret void
 152 }
 153
 ; Negates fadd(-a, b) while -a is also stored, so the negated input must be
 ; materialized in both runs.
 ; Default run: the sign bit is built once with v_bfrev_b32 of 1 and reused by
 ; the xor that forms -a and the xor that negates the v_sub_f32 B, A result.
 ; No-signed-zeros run: an xor with 0x80000000 for -a plus v_sub_f32 A, B,
 ; then the two stores back to back.
 ; Note: %out.gep is computed but never used; both volatile stores go to %out.
 154 ; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
 155 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 156 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 157
 158 ; GCN-SAFE: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1{{$}}
 159 ; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[A]], [[SIGNBIT]]
 160 ; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 161 ; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[ADD]], [[SIGNBIT]]
 162
 163 ; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
 164 ; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
 165 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
 166 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
 167 define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 168   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 169   %tid.ext = sext i32 %tid to i64
 170   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 171   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 172   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 173   %a = load volatile float, float addrspace(1)* %a.gep
 174   %b = load volatile float, float addrspace(1)* %b.gep
 175   %fneg.a = fsub float -0.000000e+00, %a
 176   %add = fadd float %fneg.a, %b
 177   %fneg = fsub float -0.000000e+00, %add
 178   store volatile float %fneg, float addrspace(1)* %out
 179   store volatile float %fneg.a, float addrspace(1)* %out
 180   ret void
 181 }
 182
 ; Negates fadd(-a, b) while -a is also multiplied by the kernel argument %c.
 ; In both runs the multiply is expected to take -a as a source modifier on a
 ; v_mul_f32_e64 with a scalar-register second operand.
 ; Default run: v_sub_f32 B, A followed by a sign-bit xor.
 ; No-signed-zeros run: v_sub_f32 A, B with no xor, then the two stores.
 ; Note: %out.gep is computed but never used; both volatile stores go to %out.
 183 ; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
 184 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 185 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 186
 187 ; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 188 ; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 189 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
 190
 191 ; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
 192 ; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 193 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
 194 ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
 195 define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
 196   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 197   %tid.ext = sext i32 %tid to i64
 198   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 199   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 200   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 201   %a = load volatile float, float addrspace(1)* %a.gep
 202   %b = load volatile float, float addrspace(1)* %b.gep
 203   %fneg.a = fsub float -0.000000e+00, %a
 204   %add = fadd float %fneg.a, %b
 205   %fneg = fsub float -0.000000e+00, %add
 206   %use1 = fmul float %fneg.a, %c
 207   store volatile float %fneg, float addrspace(1)* %out
 208   store volatile float %use1, float addrspace(1)* %out
 209   ret void
 210 }
 211
 212 ; --------------------------------------------------------------------------------
 213 ; fmul tests
 214 ; --------------------------------------------------------------------------------
 215
 ; Negates an fmul whose only user is the negation. Both runs share the same
 ; checks: a single v_mul_f32_e64 with the negation carried as a source
 ; modifier on B, stored immediately afterwards.
 216 ; GCN-LABEL: {{^}}v_fneg_mul_f32:
 217 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 218 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 219 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
 220 ; GCN-NEXT: buffer_store_dword [[RESULT]]
 221 define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 222   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 223   %tid.ext = sext i32 %tid to i64
 224   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 225   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 226   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 227   %a = load volatile float, float addrspace(1)* %a.gep
 228   %b = load volatile float, float addrspace(1)* %b.gep
 229   %mul = fmul float %a, %b
 230   %fneg = fsub float -0.000000e+00, %mul
 231   store float %fneg, float addrspace(1)* %out.gep
 232   ret void
 233 }
 234
 ; The fmul result is stored in addition to being negated, so both runs expect
 ; the v_mul_f32 to remain and the negation to be a separate sign-bit xor.
 ; The captured multiply result is named MUL (it was previously called ADD, a
 ; leftover from the fadd tests); the matched patterns are unchanged.
 ; Note: %out.gep is computed but never used; both volatile stores go to %out.
 235 ; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
 236 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 237 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 238 ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
 239 ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[MUL]]
 240 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
 241 ; GCN: buffer_store_dword [[MUL]]
 242 define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 243   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 244   %tid.ext = sext i32 %tid to i64
 245   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 246   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 247   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 248   %a = load volatile float, float addrspace(1)* %a.gep
 249   %b = load volatile float, float addrspace(1)* %b.gep
 250   %mul = fmul float %a, %b
 251   %fneg = fsub float -0.000000e+00, %mul
 252   store volatile float %fneg, float addrspace(1)* %out
 253   store volatile float %mul, float addrspace(1)* %out
 254   ret void
 255 }
 256
 ; The fmul feeds both the negation and a second fmul by 4.0. Both runs expect
 ; the negation folded into the first multiply (source modifier on B) and the
 ; second multiply to consume that negated value with the constant flipped to
 ; -4.0, followed by the two stores.
 ; Note: %out.gep is computed but never used; both volatile stores go to %out.
 257 ; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
 258 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 259 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 260 ; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
 261 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
 262 ; GCN-NEXT: buffer_store_dword [[MUL0]]
 263 ; GCN-NEXT: buffer_store_dword [[MUL1]]
 264 define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 265   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 266   %tid.ext = sext i32 %tid to i64
 267   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 268   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 269   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 270   %a = load volatile float, float addrspace(1)* %a.gep
 271   %b = load volatile float, float addrspace(1)* %b.gep
 272   %mul = fmul float %a, %b
 273   %fneg = fsub float -0.000000e+00, %mul
 274   %use1 = fmul float %mul, 4.0
 275   store volatile float %fneg, float addrspace(1)* %out
 276   store volatile float %use1, float addrspace(1)* %out
 277   ret void
 278 }
 279
 ; Negates fmul(-a, b). Both runs expect the two negations to cancel into a
 ; plain v_mul_f32 A, B that is stored immediately.
 ; The captured value is named RESULT (it was previously called ADD, a leftover
 ; from the fadd tests); the matched patterns are unchanged.
 ; Note: %out.gep is computed but never used; the volatile store goes to %out.
 280 ; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
 281 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 282 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 283 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[B]]
 284 ; GCN-NEXT: buffer_store_dword [[RESULT]]
 285 define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 286   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 287   %tid.ext = sext i32 %tid to i64
 288   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 289   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 290   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 291   %a = load volatile float, float addrspace(1)* %a.gep
 292   %b = load volatile float, float addrspace(1)* %b.gep
 293   %fneg.a = fsub float -0.000000e+00, %a
 294   %mul = fmul float %fneg.a, %b
 295   %fneg = fsub float -0.000000e+00, %mul
 296   store volatile float %fneg, float addrspace(1)* %out
 297   ret void
 298 }
 299
 ; Negates fmul(a, -b). Both runs expect the two negations to cancel into a
 ; plain v_mul_f32 A, B that is stored immediately.
 ; The captured value is named RESULT (it was previously called ADD, a leftover
 ; from the fadd tests); the matched patterns are unchanged.
 ; Note: %out.gep is computed but never used; the volatile store goes to %out.
 300 ; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
 301 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 302 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 303 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[B]]
 304 ; GCN-NEXT: buffer_store_dword [[RESULT]]
 305 define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 306   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 307   %tid.ext = sext i32 %tid to i64
 308   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 309   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 310   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 311   %a = load volatile float, float addrspace(1)* %a.gep
 312   %b = load volatile float, float addrspace(1)* %b.gep
 313   %fneg.b = fsub float -0.000000e+00, %b
 314   %mul = fmul float %a, %fneg.b
 315   %fneg = fsub float -0.000000e+00, %mul
 316   store volatile float %fneg, float addrspace(1)* %out
 317   ret void
 318 }
 319
 ; Negates fmul(-a, -b). Both runs expect the three negations to reduce to a
 ; single one, carried as a source modifier on B of one v_mul_f32_e64.
 ; The captured value is named RESULT (it was previously called ADD, a leftover
 ; from the fadd tests); the matched patterns are unchanged.
 ; Note: %out.gep is computed but never used; the volatile store goes to %out.
 320 ; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
 321 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 322 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 323 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
 324 ; GCN-NEXT: buffer_store_dword [[RESULT]]
 325 define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 326   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 327   %tid.ext = sext i32 %tid to i64
 328   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 329   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 330   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 331   %a = load volatile float, float addrspace(1)* %a.gep
 332   %b = load volatile float, float addrspace(1)* %b.gep
 333   %fneg.a = fsub float -0.000000e+00, %a
 334   %fneg.b = fsub float -0.000000e+00, %b
 335   %mul = fmul float %fneg.a, %fneg.b
 336   %fneg = fsub float -0.000000e+00, %mul
 337   store volatile float %fneg, float addrspace(1)* %out
 338   ret void
 339 }
 340
 ; Negates fmul(-a, b) while -a is also stored. Both runs expect -a to be
 ; materialized with a sign-bit xor for the store, and the negated product to
 ; be a plain v_mul_f32 A, B (the two negations cancel).
 ; Note: %out.gep is computed but never used; both volatile stores go to %out.
 341 ; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
 342 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 343 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 344 ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
 345 ; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
 346 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
 347 ; GCN: buffer_store_dword [[NEG_A]]
 348 define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 349   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 350   %tid.ext = sext i32 %tid to i64
 351   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 352   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 353   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 354   %a = load volatile float, float addrspace(1)* %a.gep
 355   %b = load volatile float, float addrspace(1)* %b.gep
 356   %fneg.a = fsub float -0.000000e+00, %a
 357   %mul = fmul float %fneg.a, %b
 358   %fneg = fsub float -0.000000e+00, %mul
 359   store volatile float %fneg, float addrspace(1)* %out
 360   store volatile float %fneg.a, float addrspace(1)* %out
 361   ret void
 362 }
 363
 ; Negates fmul(-a, b) while -a is also multiplied by the kernel argument %c.
 ; Both runs expect a plain v_mul_f32 A, B for the negated product and a
 ; v_mul_f32_e64 that takes -a as a source modifier with a scalar-register
 ; second operand for the extra use.
 ; Note: %out.gep is computed but never used; both volatile stores go to %out.
 364 ; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
 365 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 366 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 367 ; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
 368 ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 369 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
 370 ; GCN: buffer_store_dword [[MUL]]
 371 define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
 372   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 373   %tid.ext = sext i32 %tid to i64
 374   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 375   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 376   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 377   %a = load volatile float, float addrspace(1)* %a.gep
 378   %b = load volatile float, float addrspace(1)* %b.gep
 379   %fneg.a = fsub float -0.000000e+00, %a
 380   %mul = fmul float %fneg.a, %b
 381   %fneg = fsub float -0.000000e+00, %mul
 382   %use1 = fmul float %fneg.a, %c
 383   store volatile float %fneg, float addrspace(1)* %out
 384   store volatile float %use1, float addrspace(1)* %out
 385   ret void
 386 }
 387
 388 ; --------------------------------------------------------------------------------
 389 ; fminnum tests
 390 ; --------------------------------------------------------------------------------
 391
 ; Negates llvm.minnum(a, b). Both runs expect the negation to be pushed into
 ; the operands, turning the min into a v_max_f32_e64 with both sources
 ; negated.
 392 ; GCN-LABEL: {{^}}v_fneg_minnum_f32:
 393 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 394 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 395 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
 396 ; GCN: buffer_store_dword [[RESULT]]
 397 define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 398   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 399   %tid.ext = sext i32 %tid to i64
 400   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 401   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 402   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 403   %a = load volatile float, float addrspace(1)* %a.gep
 404   %b = load volatile float, float addrspace(1)* %b.gep
 405   %min = call float @llvm.minnum.f32(float %a, float %b)
 406   %fneg = fsub float -0.000000e+00, %min
 407   store float %fneg, float addrspace(1)* %out.gep
 408   ret void
 409 }
 410
 ; Negates llvm.minnum(a, a), i.e. both operands are the same loaded value.
 ; Both runs expect a v_max_f32_e64 that uses the negated A for both sources.
 411 ; GCN-LABEL: {{^}}v_fneg_self_minnum_f32:
 412 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 413 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
 414 ; GCN: buffer_store_dword [[RESULT]]
 415 define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
 416   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 417   %tid.ext = sext i32 %tid to i64
 418   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 419   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 420   %a = load volatile float, float addrspace(1)* %a.gep
 421   %min = call float @llvm.minnum.f32(float %a, float %a)
 422   %min.fneg = fsub float -0.0, %min
 423   store float %min.fneg, float addrspace(1)* %out.gep
 424   ret void
 425 }
 426
 ; Negates llvm.minnum(4.0, a). Both runs expect a v_max_f32_e64 of the
 ; negated A against the negated constant, -4.0.
 427 ; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32:
 428 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 429 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
 430 ; GCN: buffer_store_dword [[RESULT]]
 431 define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
 432   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 433   %tid.ext = sext i32 %tid to i64
 434   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 435   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 436   %a = load volatile float, float addrspace(1)* %a.gep
 437   %min = call float @llvm.minnum.f32(float 4.0, float %a)
 438   %fneg = fsub float -0.000000e+00, %min
 439   store float %fneg, float addrspace(1)* %out.gep
 440   ret void
 441 }
 442
 ; Negates llvm.minnum(-4.0, a). Both runs expect a v_max_f32_e64 of the
 ; negated A against the negated constant, 4.0.
 443 ; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32:
 444 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 445 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
 446 ; GCN: buffer_store_dword [[RESULT]]
 447 define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
 448   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 449   %tid.ext = sext i32 %tid to i64
 450   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 451   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 452   %a = load volatile float, float addrspace(1)* %a.gep
 453   %min = call float @llvm.minnum.f32(float -4.0, float %a)
 454   %fneg = fsub float -0.000000e+00, %min
 455   store float %fneg, float addrspace(1)* %out.gep
 456   ret void
 457 }
 458
 ; Negates llvm.minnum(0.0, a). Unlike the other constant cases, the checks
 ; expect the min to stay a v_min_f32 against the literal 0 rather than being
 ; turned into a max of negated operands. The checks only pin that a stored
 ; value was captured from the v_min_f32; they do not pin how the negation
 ; itself is emitted.
 459 ; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
 460 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 461 ; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
 462 ; GCN: buffer_store_dword [[RESULT]]
 463 define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
 464   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 465   %tid.ext = sext i32 %tid to i64
 466   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 467   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 468   %a = load volatile float, float addrspace(1)* %a.gep
 469   %min = call float @llvm.minnum.f32(float 0.0, float %a)
 470   %fneg = fsub float -0.000000e+00, %min
 471   store float %fneg, float addrspace(1)* %out.gep
 472   ret void
 473 }
 474
 ; Negates llvm.minnum(-0.0, a). Both runs expect a v_max_f32_e64 of the
 ; negated A against the negated constant, which is the literal 0.
 475 ; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32:
 476 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 477 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
 478 ; GCN: buffer_store_dword [[RESULT]]
 479 define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
 480   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 481   %tid.ext = sext i32 %tid to i64
 482   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 483   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 484   %a = load volatile float, float addrspace(1)* %a.gep
 485   %min = call float @llvm.minnum.f32(float -0.0, float %a)
 486   %fneg = fsub float -0.000000e+00, %min
 487   store float %fneg, float addrspace(1)* %out.gep
 488   ret void
 489 }
 490
 ; Negates llvm.minnum(0.0, a) and multiplies the negated value by b. Both
 ; runs expect the min to stay a v_min_f32 against the literal 0 and the
 ; negation to be absorbed as a source modifier on the v_mul_f32_e64 that
 ; consumes it.
 491 ; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32:
 492 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 493 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 494 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
 495 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
 496 ; GCN: buffer_store_dword [[RESULT]]
 497 define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 498   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 499   %tid.ext = sext i32 %tid to i64
 500   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 501   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 502   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 503   %a = load volatile float, float addrspace(1)* %a.gep
 504   %b = load volatile float, float addrspace(1)* %b.gep
 505   %min = call float @llvm.minnum.f32(float 0.0, float %a)
 506   %fneg = fsub float -0.000000e+00, %min
 507   %mul = fmul float %fneg, %b
 508   store float %mul, float addrspace(1)* %out.gep
 509   ret void
 510 }
 511
 ; llvm.minnum(a, b) feeds both the negation and an fmul by 4.0. Both runs
 ; expect the negated min to be computed as a v_max_f32_e64 of negated
 ; sources, and the multiply to consume that negated value with the constant
 ; flipped to -4.0, followed by the two stores.
 ; Note: %out.gep is computed but never used; both volatile stores go to %out.
 512 ; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32:
 513 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 514 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 515 ; GCN: v_max_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
 516 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
 517 ; GCN-NEXT: buffer_store_dword [[MAX0]]
 518 ; GCN-NEXT: buffer_store_dword [[MUL1]]
 519 define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 520   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 521   %tid.ext = sext i32 %tid to i64
 522   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 523   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 524   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 525   %a = load volatile float, float addrspace(1)* %a.gep
 526   %b = load volatile float, float addrspace(1)* %b.gep
 527   %min = call float @llvm.minnum.f32(float %a, float %b)
 528   %fneg = fsub float -0.000000e+00, %min
 529   %use1 = fmul float %min, 4.0
 530   store volatile float %fneg, float addrspace(1)* %out
 531   store volatile float %use1, float addrspace(1)* %out
 532   ret void
 533 }
 534
 535 ; --------------------------------------------------------------------------------
 536 ; fmaxnum tests
 537 ; --------------------------------------------------------------------------------
 538
 ; Negates llvm.maxnum(a, b). Both runs expect the negation to be pushed into
 ; the operands, turning the max into a v_min_f32_e64 with both sources
 ; negated.
 ; The intrinsic result is named %max (it was previously called %min, a
 ; leftover from the minnum tests); the instructions are otherwise unchanged.
 539 ; GCN-LABEL: {{^}}v_fneg_maxnum_f32:
 540 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 541 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 542 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
 543 ; GCN: buffer_store_dword [[RESULT]]
 544 define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 545   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 546   %tid.ext = sext i32 %tid to i64
 547   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 548   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 549   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 550   %a = load volatile float, float addrspace(1)* %a.gep
 551   %b = load volatile float, float addrspace(1)* %b.gep
 552   %max = call float @llvm.maxnum.f32(float %a, float %b)
 553   %fneg = fsub float -0.000000e+00, %max
 554   store float %fneg, float addrspace(1)* %out.gep
 555   ret void
 556 }
 557
 ; Negates llvm.maxnum(a, a), i.e. both operands are the same loaded value.
 ; Both runs expect a v_min_f32_e64 that uses the negated A for both sources.
 ; The intrinsic result and its negation are named %max / %max.fneg (they were
 ; previously %min / %min.fneg, a leftover from the minnum tests).
 558 ; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32:
 559 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 560 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
 561 ; GCN: buffer_store_dword [[RESULT]]
 562 define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
 563   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 564   %tid.ext = sext i32 %tid to i64
 565   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 566   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 567   %a = load volatile float, float addrspace(1)* %a.gep
 568   %max = call float @llvm.maxnum.f32(float %a, float %a)
 569   %max.fneg = fsub float -0.0, %max
 570   store float %max.fneg, float addrspace(1)* %out.gep
 571   ret void
 572 }
 573
 ; Negates llvm.maxnum(4.0, a). Both runs expect a v_min_f32_e64 of the
 ; negated A against the negated constant, -4.0.
 ; The intrinsic result is named %max (it was previously called %min, a
 ; leftover from the minnum tests); the instructions are otherwise unchanged.
 574 ; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32:
 575 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 576 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
 577 ; GCN: buffer_store_dword [[RESULT]]
 578 define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
 579   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 580   %tid.ext = sext i32 %tid to i64
 581   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 582   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 583   %a = load volatile float, float addrspace(1)* %a.gep
 584   %max = call float @llvm.maxnum.f32(float 4.0, float %a)
 585   %fneg = fsub float -0.000000e+00, %max
 586   store float %fneg, float addrspace(1)* %out.gep
 587   ret void
 588 }
 589
 ; Negates llvm.maxnum(-4.0, a). Both runs expect a v_min_f32_e64 of the
 ; negated A against the negated constant, 4.0.
 ; The intrinsic result is named %max (it was previously called %min, a
 ; leftover from the minnum tests); the instructions are otherwise unchanged.
 590 ; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32:
 591 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 592 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
 593 ; GCN: buffer_store_dword [[RESULT]]
 594 define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
 595   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 596   %tid.ext = sext i32 %tid to i64
 597   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 598   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 599   %a = load volatile float, float addrspace(1)* %a.gep
 600   %max = call float @llvm.maxnum.f32(float -4.0, float %a)
 601   %fneg = fsub float -0.000000e+00, %max
 602   store float %fneg, float addrspace(1)* %out.gep
 603   ret void
 604 }
 605
 606 ; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
     ; fneg(maxnum(0.0, a)): unlike the nonzero-constant cases, the checks
     ; expect an unmodified v_max_f32 against 0 (presumably because the negated
     ; constant -0.0 cannot be used directly - TODO confirm).
 607 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 608 ; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
 609 ; GCN: buffer_store_dword [[RESULT]]
 610 define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
 611   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 612   %tid.ext = sext i32 %tid to i64
 613   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 614   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 615   %a = load volatile float, float addrspace(1)* %a.gep
 616   %max = call float @llvm.maxnum.f32(float 0.0, float %a)
 617   %fneg = fsub float -0.000000e+00, %max ; fneg written as (-0.0 - x)
 618   store float %fneg, float addrspace(1)* %out.gep
 619   ret void
 620 }
 621
 622 ; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32:
     ; fneg(maxnum(-0.0, a)): the checks expect v_min_f32 of the negated load
     ; result and the constant 0.
 623 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 624 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
 625 ; GCN: buffer_store_dword [[RESULT]]
 626 define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
 627   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 628   %tid.ext = sext i32 %tid to i64
 629   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 630   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 631   %a = load volatile float, float addrspace(1)* %a.gep
 632   %max = call float @llvm.maxnum.f32(float -0.0, float %a)
 633   %fneg = fsub float -0.000000e+00, %max ; fneg written as (-0.0 - x)
 634   store float %fneg, float addrspace(1)* %out.gep
 635   ret void
 636 }
 637
 638 ; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32:
     ; fneg(maxnum(0.0, a)) feeding an fmul: the checks expect an unmodified
     ; v_max_f32 and the negation applied as an operand modifier on the v_mul.
 639 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 640 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 641 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
 642 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
 643 ; GCN: buffer_store_dword [[RESULT]]
 644 define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 645   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 646   %tid.ext = sext i32 %tid to i64
 647   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 648   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 649   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 650   %a = load volatile float, float addrspace(1)* %a.gep
 651   %b = load volatile float, float addrspace(1)* %b.gep
 652   %max = call float @llvm.maxnum.f32(float 0.0, float %a)
 653   %fneg = fsub float -0.000000e+00, %max ; fneg written as (-0.0 - x)
 654   %mul = fmul float %fneg, %b ; the only user of the negated value
 655   store float %mul, float addrspace(1)* %out.gep
 656   ret void
 657 }
 658
 659 ; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32:
     ; The maxnum result is used both by the fneg and by an fmul with 4.0. The
     ; checks expect one v_min_f32 of the negated inputs (the negated max) and
     ; a v_mul_f32 of that value by -4.0, followed by the two stores.
 660 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 661 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 662 ; GCN: v_min_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
 663 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
 664 ; GCN-NEXT: buffer_store_dword [[MAX0]]
 665 ; GCN-NEXT: buffer_store_dword [[MUL1]]
 666 define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
 667   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 668   %tid.ext = sext i32 %tid to i64
 669   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 670   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 671   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; unused; stores below use %out
 672   %a = load volatile float, float addrspace(1)* %a.gep
 673   %b = load volatile float, float addrspace(1)* %b.gep
 674   %max = call float @llvm.maxnum.f32(float %a, float %b)
 675   %fneg = fsub float -0.000000e+00, %max ; fneg written as (-0.0 - x)
 676   %use1 = fmul float %max, 4.0 ; second, non-negated use of the max
 677   store volatile float %fneg, float addrspace(1)* %out
 678   store volatile float %use1, float addrspace(1)* %out
 679   ret void
 680 }
 681
 682 ; --------------------------------------------------------------------------------
 683 ; fma tests
 684 ; --------------------------------------------------------------------------------
 685
 686 ; GCN-LABEL: {{^}}v_fneg_fma_f32:
     ; fneg(fma(a, b, c)). Default run: plain v_fma_f32 followed by a sign-bit
     ; xor. No-signed-zeros run: a single v_fma_f32 with B and C negated.
 687 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 688 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 689 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
 690
 691 ; GCN-SAFE: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
 692 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[RESULT]]
 693
 694 ; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
 695 ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
 696 define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
 697   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 698   %tid.ext = sext i32 %tid to i64
 699   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 700   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 701   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
 702   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 703   %a = load volatile float, float addrspace(1)* %a.gep
 704   %b = load volatile float, float addrspace(1)* %b.gep
 705   %c = load volatile float, float addrspace(1)* %c.gep
 706   %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
 707   %fneg = fsub float -0.000000e+00, %fma ; fneg written as (-0.0 - x)
 708   store float %fneg, float addrspace(1)* %out.gep
 709   ret void
 710 }
 711
 712 ; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32:
     ; Both the fma result and its negation are stored. For both runs the
     ; checks expect an unmodified v_fma_f32 plus a sign-bit xor.
 713 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 714 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 715 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
 716 ; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
 717 ; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
 718 ; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
 719 ; GCN-NEXT: buffer_store_dword [[FMA]]
 720 define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
 721   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 722   %tid.ext = sext i32 %tid to i64
 723   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 724   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 725   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
 726   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; unused; stores below use %out
 727   %a = load volatile float, float addrspace(1)* %a.gep
 728   %b = load volatile float, float addrspace(1)* %b.gep
 729   %c = load volatile float, float addrspace(1)* %c.gep
 730   %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
 731   %fneg = fsub float -0.000000e+00, %fma ; fneg written as (-0.0 - x)
 732   store volatile float %fneg, float addrspace(1)* %out
 733   store volatile float %fma, float addrspace(1)* %out
 734   ret void
 735 }
 736
 737 ; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32:
     ; The fma result is used by the fneg and by an fmul with 4.0. Default run:
     ; v_fma_f32, xor, and a multiply by 4.0. No-signed-zeros run: a negated
     ; v_fma_f32 (B and C negated) and a multiply of that value by -4.0.
 738 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 739 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 740 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
 741
 742 ; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
 743 ; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
 744 ; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]
 745
 746 ; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
 747 ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]
 748
 749 ; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
 750 ; GCN-NEXT: buffer_store_dword [[MUL]]
 751 define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
 752   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 753   %tid.ext = sext i32 %tid to i64
 754   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 755   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 756   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
 757   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; unused; stores below use %out
 758   %a = load volatile float, float addrspace(1)* %a.gep
 759   %b = load volatile float, float addrspace(1)* %b.gep
 760   %c = load volatile float, float addrspace(1)* %c.gep
 761   %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
 762   %fneg = fsub float -0.000000e+00, %fma ; fneg written as (-0.0 - x)
 763   %use1 = fmul float %fma, 4.0 ; second, non-negated use of the fma
 764   store volatile float %fneg, float addrspace(1)* %out
 765   store volatile float %use1, float addrspace(1)* %out
 766   ret void
 767 }
 768
 769 ; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_y_f32:
     ; fneg(fma(-a, b, c)). Default run: v_fma_f32 with A negated, then a
     ; sign-bit xor. No-signed-zeros run: one v_fma_f32 with only C negated.
 770 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 771 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 772 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
 773
 774 ; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], [[C]]
 775 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
 776
 777 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 778 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 779 define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
 780   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 781   %tid.ext = sext i32 %tid to i64
 782   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 783   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 784   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
 785   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; unused; store below uses %out
 786   %a = load volatile float, float addrspace(1)* %a.gep
 787   %b = load volatile float, float addrspace(1)* %b.gep
 788   %c = load volatile float, float addrspace(1)* %c.gep
 789   %fneg.a = fsub float -0.000000e+00, %a
 790   %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
 791   %fneg = fsub float -0.000000e+00, %fma ; outer fneg, written as (-0.0 - x)
 792   store volatile float %fneg, float addrspace(1)* %out
 793   ret void
 794 }
 795
 796 ; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32:
     ; fneg(fma(a, -b, c)). Default run: v_fma_f32 with B negated, then a
     ; sign-bit xor. No-signed-zeros run: one v_fma_f32 with only C negated.
 797 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 798 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 799 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
 800
 801 ; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
 802 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
 803
 804 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 805 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 806 define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
 807   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 808   %tid.ext = sext i32 %tid to i64
 809   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 810   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
 811   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 812   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; unused; store below uses %out
 813   %a = load volatile float, float addrspace(1)* %a.gep
 814   %b = load volatile float, float addrspace(1)* %b.gep
 815   %c = load volatile float, float addrspace(1)* %c.gep
 816   %fneg.b = fsub float -0.000000e+00, %b
 817   %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c)
 818   %fneg = fsub float -0.000000e+00, %fma ; outer fneg, written as (-0.0 - x)
 819   store volatile float %fneg, float addrspace(1)* %out
 820   ret void
 821 }
 822
 823 ; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32:
     ; fneg(fma(-a, -b, c)). Default run: v_fma_f32 with A and B negated, then
     ; a sign-bit xor. No-signed-zeros run: one v_fma_f32 with B and C negated.
 824 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 825 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 826 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
 827
 828 ; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], -[[B]], [[C]]
 829 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
 830
 831 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
 832 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 833 define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
 834   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 835   %tid.ext = sext i32 %tid to i64
 836   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 837   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 838   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
 839   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; unused; store below uses %out
 840   %a = load volatile float, float addrspace(1)* %a.gep
 841   %b = load volatile float, float addrspace(1)* %b.gep
 842   %c = load volatile float, float addrspace(1)* %c.gep
 843   %fneg.a = fsub float -0.000000e+00, %a
 844   %fneg.b = fsub float -0.000000e+00, %b
 845   %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c)
 846   %fneg = fsub float -0.000000e+00, %fma ; outer fneg, written as (-0.0 - x)
 847   store volatile float %fneg, float addrspace(1)* %out
 848   ret void
 849 }
 850
 851 ; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32:
     ; fneg(fma(-a, b, -c)). Default run: v_fma_f32 with A and C negated, then
     ; a sign-bit xor. No-signed-zeros run: one v_fma_f32 with no negations.
 852 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 853 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 854 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
 855
 856 ; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], -[[C]]
 857 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
 858
 859 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
 860 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 861 define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
 862   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 863   %tid.ext = sext i32 %tid to i64
 864   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 865   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 866   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
 867   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; unused; store below uses %out
 868   %a = load volatile float, float addrspace(1)* %a.gep
 869   %b = load volatile float, float addrspace(1)* %b.gep
 870   %c = load volatile float, float addrspace(1)* %c.gep
 871   %fneg.a = fsub float -0.000000e+00, %a
 872   %fneg.c = fsub float -0.000000e+00, %c
 873   %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c)
 874   %fneg = fsub float -0.000000e+00, %fma ; outer fneg, written as (-0.0 - x)
 875   store volatile float %fneg, float addrspace(1)* %out
 876   ret void
 877 }
 878
 879 ; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32:
     ; fneg(fma(a, b, -c)). Default run: v_fma_f32 with C negated, then a
     ; sign-bit xor. No-signed-zeros run: one v_fma_f32 with only B negated.
 880 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 881 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 882 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
 883
 884 ; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 885 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
 886
 887 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
 888 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 889 define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
 890   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 891   %tid.ext = sext i32 %tid to i64
 892   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 893   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 894   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
 895   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; unused; store below uses %out
 896   %a = load volatile float, float addrspace(1)* %a.gep
 897   %b = load volatile float, float addrspace(1)* %b.gep
 898   %c = load volatile float, float addrspace(1)* %c.gep
 899   %fneg.c = fsub float -0.000000e+00, %c
 900   %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c)
 901   %fneg = fsub float -0.000000e+00, %fma ; outer fneg, written as (-0.0 - x)
 902   store volatile float %fneg, float addrspace(1)* %out
 903   ret void
 904 }
 905
 906 ; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32:
     ; fneg(fma(-a, b, c)) where -a is also stored. Default run: two xors
     ; around a v_fma_f32 taking negated A. No-signed-zeros run: an xor that
     ; materializes -a for the store plus one v_fma_f32 with only C negated.
 907 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 908 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 909 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
 910
 911 ; GCN-SAFE: v_xor_b32
 912 ; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]],
 913 ; GCN-SAFE: v_xor_b32
 914
 915 ; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
 916 ; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 917 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 918 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
 919 define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
 920   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 921   %tid.ext = sext i32 %tid to i64
 922   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 923   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 924   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
 925   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; unused; stores below use %out
 926   %a = load volatile float, float addrspace(1)* %a.gep
 927   %b = load volatile float, float addrspace(1)* %b.gep
 928   %c = load volatile float, float addrspace(1)* %c.gep
 929   %fneg.a = fsub float -0.000000e+00, %a
 930   %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
 931   %fneg = fsub float -0.000000e+00, %fma ; outer fneg, written as (-0.0 - x)
 932   store volatile float %fneg, float addrspace(1)* %out
 933   store volatile float %fneg.a, float addrspace(1)* %out ; extra use of -a
 934   ret void
 935 }
 936
 937 ; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fneg_x_y_f32:
     ; fneg(fma(-a, b, c)) where -a also feeds an fmul with the scalar kernel
     ; argument %d. Both runs expect the multiply to take -A as an operand
     ; modifier; the no-signed-zeros run additionally expects one v_fma_f32
     ; with only C negated.
 938 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 939 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 940 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
 941
 942 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 943 ; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]]
 944 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
 945
 946 ; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 947 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_FMA]]
 948 ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
 949 define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
 950   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 951   %tid.ext = sext i32 %tid to i64
 952   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 953   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 954   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
 955   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; unused; stores below use %out
 956   %a = load volatile float, float addrspace(1)* %a.gep
 957   %b = load volatile float, float addrspace(1)* %b.gep
 958   %c = load volatile float, float addrspace(1)* %c.gep
 959   %fneg.a = fsub float -0.000000e+00, %a
 960   %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
 961   %fneg = fsub float -0.000000e+00, %fma ; outer fneg, written as (-0.0 - x)
 962   %use1 = fmul float %fneg.a, %d ; extra use of -a
 963   store volatile float %fneg, float addrspace(1)* %out
 964   store volatile float %use1, float addrspace(1)* %out
 965   ret void
 966 }
 967
 968 ; --------------------------------------------------------------------------------
 969 ; fmad tests
 970 ; --------------------------------------------------------------------------------
 971
 972 ; GCN-LABEL: {{^}}v_fneg_fmad_f32:
     ; fneg(fmuladd(a, b, c)). Default run: v_mac_f32 accumulating into C, then
     ; a sign-bit xor. No-signed-zeros run: one v_mad_f32 with B and C negated.
 973 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 974 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 975 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
 976
 977 ; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
 978 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]
 979
 980 ; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
 981 ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
 982 define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
 983   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 984   %tid.ext = sext i32 %tid to i64
 985   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
 986   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
 987   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
 988   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
 989   %a = load volatile float, float addrspace(1)* %a.gep
 990   %b = load volatile float, float addrspace(1)* %b.gep
 991   %c = load volatile float, float addrspace(1)* %c.gep
 992   %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
 993   %fneg = fsub float -0.000000e+00, %fma ; fneg written as (-0.0 - x)
 994   store float %fneg, float addrspace(1)* %out.gep
 995   ret void
 996 }
 997
 998 ; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
     ; The fmuladd result is used by the fneg and by an fmul with 4.0. Default
     ; run: v_mac_f32, xor, multiply by 4.0. No-signed-zeros run: a negated
     ; v_mad_f32 and a multiply of that value by -4.0.
 999 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1000 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1001 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1002
1003 ; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
1004 ; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
1005 ; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
1006
1007 ; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], -[[A]], [[B]], -[[C]]
1008 ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]
1009
1010 ; GCN: buffer_store_dword [[NEG_MAD]]
1011 ; GCN-NEXT: buffer_store_dword [[MUL]]
1012 define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1013   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1014   %tid.ext = sext i32 %tid to i64
1015   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1016   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1017   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1018   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; unused; stores below use %out
1019   %a = load volatile float, float addrspace(1)* %a.gep
1020   %b = load volatile float, float addrspace(1)* %b.gep
1021   %c = load volatile float, float addrspace(1)* %c.gep
1022   %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
1023   %fneg = fsub float -0.000000e+00, %fma ; fneg written as (-0.0 - x)
1024   %use1 = fmul float %fma, 4.0 ; second, non-negated use of the fmuladd
1025   store volatile float %fneg, float addrspace(1)* %out
1026   store volatile float %use1, float addrspace(1)* %out
1027   ret void
1028 }
1029
1030 ; --------------------------------------------------------------------------------
1031 ; fp_extend tests
1032 ; --------------------------------------------------------------------------------
1033
1034 ; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64:
     ; fneg(fpext(a)): the checks expect the negation applied as an operand
     ; modifier on the f32-to-f64 conversion.
1035 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1036 ; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
1037 ; GCN: buffer_store_dwordx2 [[RESULT]]
1038 define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1039   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1040   %tid.ext = sext i32 %tid to i64
1041   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1042   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
1043   %a = load volatile float, float addrspace(1)* %a.gep
1044   %fpext = fpext float %a to double
1045   %fneg = fsub double -0.000000e+00, %fpext ; fneg written as (-0.0 - x)
1046   store double %fneg, double addrspace(1)* %out.gep
1047   ret void
1048 }
1049
1050 ; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64:
     ; fneg(fpext(fneg(a))): the two negations are expected to cancel, leaving
     ; an unmodified f32-to-f64 conversion.
1051 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1052 ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
1053 ; GCN: buffer_store_dwordx2 [[RESULT]]
1054 define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1055   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1056   %tid.ext = sext i32 %tid to i64
1057   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1058   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
1059   %a = load volatile float, float addrspace(1)* %a.gep
1060   %fneg.a = fsub float -0.000000e+00, %a
1061   %fpext = fpext float %fneg.a to double
1062   %fneg = fsub double -0.000000e+00, %fpext ; outer fneg, written as (-0.0 - x)
1063   store double %fneg, double addrspace(1)* %out.gep
1064   ret void
1065 }
1066
1067 ; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64:
     ; fneg(fpext(fneg(a))) where the inner -a is also stored. The checks
     ; expect an unmodified conversion plus an xor that materializes -a for
     ; the second store.
1068 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1069 ; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
1070 ; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
1071 ; GCN: buffer_store_dwordx2 [[RESULT]]
1072 ; GCN: buffer_store_dword [[FNEG_A]]
1073 define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1074   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1075   %tid.ext = sext i32 %tid to i64
1076   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1077   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
1078   %a = load volatile float, float addrspace(1)* %a.gep
1079   %fneg.a = fsub float -0.000000e+00, %a
1080   %fpext = fpext float %fneg.a to double
1081   %fneg = fsub double -0.000000e+00, %fpext ; outer fneg, written as (-0.0 - x)
1082   store volatile double %fneg, double addrspace(1)* %out.gep
1083   store volatile float %fneg.a, float addrspace(1)* undef ; extra use of -a
1084   ret void
1085 }
1086
1087 ; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
     ; Both fpext(a) and its negation are stored. The checks expect an
     ; unmodified conversion and an xor on the high dword of the f64 result.
1088 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1089 ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
1090 ; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
1091 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
1092 ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}
1093 define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1094   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1095   %tid.ext = sext i32 %tid to i64
1096   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1097   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
1098   %a = load volatile float, float addrspace(1)* %a.gep
1099   %fpext = fpext float %a to double
1100   %fneg = fsub double -0.000000e+00, %fpext ; fneg written as (-0.0 - x)
1101   store volatile double %fneg, double addrspace(1)* %out.gep
1102   store volatile double %fpext, double addrspace(1)* undef ; extra use of the un-negated fpext
1103   ret void
1104 }
1105
1106 ; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
     ; fpext(a) is used by the fneg and by an fmul with 4.0. The checks expect
     ; an unmodified conversion, an xor on the high dword for the negated
     ; value, and a v_mul_f64 of the un-negated conversion by 4.0.
1107 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1108 ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
1109 ; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
1110 ; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0
1111 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
1112 ; GCN: buffer_store_dwordx2 [[MUL]]
1113 define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1114   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1115   %tid.ext = sext i32 %tid to i64
1116   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1117   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
1118   %a = load volatile float, float addrspace(1)* %a.gep
1119   %fpext = fpext float %a to double
1120   %fneg = fsub double -0.000000e+00, %fpext ; fneg written as (-0.0 - x)
1121   %mul = fmul double %fpext, 4.0 ; second, non-negated use of the fpext
1122   store volatile double %fneg, double addrspace(1)* %out.gep
1123   store volatile double %mul, double addrspace(1)* %out.gep
1124   ret void
1125 }
1126
1127 ; FIXME: Source modifiers not folded for f16->f32
1128 ; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
     ; Half-to-float variant: both fpext(a) and its negation are stored. Only
     ; the function label is checked; no instruction patterns are verified.
1129 define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
1130   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1131   %tid.ext = sext i32 %tid to i64
1132   %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
1133   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1134   %a = load volatile half, half addrspace(1)* %a.gep
1135   %fpext = fpext half %a to float
1136   %fneg = fsub float -0.000000e+00, %fpext ; fneg written as (-0.0 - x)
1137   store volatile float %fneg, float addrspace(1)* %out.gep
1138   store volatile float %fpext, float addrspace(1)* %out.gep
1139   ret void
1140 }
1141
1142 ; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
     ; Half-to-float variant: fpext(a) is used by the fneg and by an fmul with
     ; 4.0. Only the function label is checked; no instruction patterns are
     ; verified.
1143 define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
1144   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1145   %tid.ext = sext i32 %tid to i64
1146   %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
1147   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1148   %a = load volatile half, half addrspace(1)* %a.gep
1149   %fpext = fpext half %a to float
1150   %fneg = fsub float -0.000000e+00, %fpext ; fneg written as (-0.0 - x)
1151   %mul = fmul float %fpext, 4.0 ; second, non-negated use of the fpext
1152   store volatile float %fneg, float addrspace(1)* %out.gep
1153   store volatile float %mul, float addrspace(1)* %out.gep
1154   ret void
1155 }
1156
1157 ; --------------------------------------------------------------------------------
1158 ; fp_round tests
1159 ; --------------------------------------------------------------------------------
1160
1161 ; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32:
     ; fneg(fptrunc(a)): the checks expect the negation applied as an operand
     ; modifier on the f64-to-f32 conversion.
1162 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
1163 ; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
1164 ; GCN: buffer_store_dword [[RESULT]]
1165 define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
1166   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1167   %tid.ext = sext i32 %tid to i64
1168   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
1169   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1170   %a = load volatile double, double addrspace(1)* %a.gep
1171   %fpround = fptrunc double %a to float
1172   %fneg = fsub float -0.000000e+00, %fpround ; fneg written as (-0.0 - x)
1173   store float %fneg, float addrspace(1)* %out.gep
1174   ret void
1175 }
1176
1177 ; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32:
1178 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
1179 ; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
1180 ; GCN: buffer_store_dword [[RESULT]]
1181 define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { ; fneg(fptrunc(fneg(a))) - input and result are both negated
1182   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1183   %tid.ext = sext i32 %tid to i64
1184   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
1185   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1186   %a = load volatile double, double addrspace(1)* %a.gep
1187   %fneg.a = fsub double -0.000000e+00, %a ; negate the f64 input
1188   %fpround = fptrunc double %fneg.a to float
1189   %fneg = fsub float -0.000000e+00, %fpround ; negate the f32 result; the checks expect a plain convert of the loaded value
1190   store float %fneg, float addrspace(1)* %out.gep
1191   ret void
1192 }
1193
1194 ; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32:
1195 ; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
1196 ; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v{{\[}}[[A_LO]]:[[A_HI]]{{\]}}
1197 ; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
1198 ; GCN: buffer_store_dword [[RESULT]]
1199 ; GCN: buffer_store_dwordx2 v{{\[}}[[A_LO]]:[[NEG_A_HI]]{{\]}}
1200 define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { ; fneg(fptrunc(fneg(a))) where fneg(a) is also stored
1201   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1202   %tid.ext = sext i32 %tid to i64
1203   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
1204   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1205   %a = load volatile double, double addrspace(1)* %a.gep
1206   %fneg.a = fsub double -0.000000e+00, %a ; negated input, used by both the fptrunc and the second store
1207   %fpround = fptrunc double %fneg.a to float
1208   %fneg = fsub float -0.000000e+00, %fpround
1209   store volatile float %fneg, float addrspace(1)* %out.gep
1210   store volatile double %fneg.a, double addrspace(1)* undef ; extra use of %fneg.a (store to an undef pointer)
1211   ret void
1212 }
1213
1214 ; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32:
1215 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
1216 ; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
1217 ; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}
1218 ; GCN: buffer_store_dword [[RESULT]]
1219 ; GCN: buffer_store_dwordx2 [[USE1]]
1220 define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 { ; fneg(fptrunc(fneg(a))) where fneg(a) also feeds an f64 multiply by %c
1221   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1222   %tid.ext = sext i32 %tid to i64
1223   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
1224   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1225   %a = load volatile double, double addrspace(1)* %a.gep
1226   %fneg.a = fsub double -0.000000e+00, %a ; negated input, used by both the fptrunc and the fmul
1227   %fpround = fptrunc double %fneg.a to float
1228   %fneg = fsub float -0.000000e+00, %fpround
1229   %use1 = fmul double %fneg.a, %c ; second user of %fneg.a
1230   store volatile float %fneg, float addrspace(1)* %out.gep
1231   store volatile double %use1, double addrspace(1)* undef
1232   ret void
1233 }
1234
1235 ; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16:
1236 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1237 ; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
1238 ; GCN: buffer_store_short [[RESULT]]
1239 define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { ; fneg of an f32->f16 fptrunc, single use
1240   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1241   %tid.ext = sext i32 %tid to i64
1242   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1243   %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
1244   %a = load volatile float, float addrspace(1)* %a.gep
1245   %fpround = fptrunc float %a to half
1246   %fneg = fsub half -0.000000e+00, %fpround ; negation spelled as (-0.0 - x) on the half result
1247   store half %fneg, half addrspace(1)* %out.gep
1248   ret void
1249 }
1250
1251 ; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16:
1252 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1253 ; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
1254 ; GCN: buffer_store_short [[RESULT]]
1255 define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { ; fneg(fptrunc(fneg(a))) for f32->f16
1256   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1257   %tid.ext = sext i32 %tid to i64
1258   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1259   %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
1260   %a = load volatile float, float addrspace(1)* %a.gep
1261   %fneg.a = fsub float -0.000000e+00, %a ; negate the f32 input
1262   %fpround = fptrunc float %fneg.a to half
1263   %fneg = fsub half -0.000000e+00, %fpround ; negate the f16 result; the checks expect a plain convert of the loaded value
1264   store half %fneg, half addrspace(1)* %out.gep
1265   ret void
1266 }
1267
1268 ; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32:
1269 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
1270 ; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]]
1271 ; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
1272 ; GCN: buffer_store_dword [[NEG]]
1273 ; GCN: buffer_store_dword [[CVT]]
1274 define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { ; fneg of an f64->f32 fptrunc whose un-negated result is also stored
1275   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1276   %tid.ext = sext i32 %tid to i64
1277   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
1278   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1279   %a = load volatile double, double addrspace(1)* %a.gep
1280   %fpround = fptrunc double %a to float ; NOTE(review) the function name mentions an input fneg, but %a is converted un-negated here - confirm the name
1281   %fneg = fsub float -0.000000e+00, %fpround
1282   store volatile float %fneg, float addrspace(1)* %out.gep
1283   store volatile float %fpround, float addrspace(1)* %out.gep ; second user of %fpround
1284   ret void
1285 }
1286
1287 ; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16:
1288 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1289 ; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
1290 ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
1291 ; GCN: buffer_store_short [[RESULT]]
1292 ; GCN: buffer_store_dword [[NEG_A]]
1293 define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { ; fneg(fptrunc(fneg(a))) for f32->f16 where fneg(a) is also stored
1294   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1295   %tid.ext = sext i32 %tid to i64
1296   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1297   %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
1298   %a = load volatile float, float addrspace(1)* %a.gep
1299   %fneg.a = fsub float -0.000000e+00, %a ; negated input, used by both the fptrunc and the second store
1300   %fpround = fptrunc float %fneg.a to half
1301   %fneg = fsub half -0.000000e+00, %fpround
1302   store volatile half %fneg, half addrspace(1)* %out.gep
1303   store volatile float %fneg.a, float addrspace(1)* undef ; extra use of %fneg.a (store to an undef pointer)
1304   ret void
1305 }
1306
1307 ; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16:
1308 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1309 ; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
1310 ; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
1311 ; GCN: buffer_store_short [[RESULT]]
1312 ; GCN: buffer_store_dword [[USE1]]
1313 define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { ; fneg(fptrunc(fneg(a))) for f32->f16 where fneg(a) also feeds an fmul by %c
1314   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1315   %tid.ext = sext i32 %tid to i64
1316   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1317   %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
1318   %a = load volatile float, float addrspace(1)* %a.gep
1319   %fneg.a = fsub float -0.000000e+00, %a ; negated input, used by both the fptrunc and the fmul
1320   %fpround = fptrunc float %fneg.a to half
1321   %fneg = fsub half -0.000000e+00, %fpround
1322   %use1 = fmul float %fneg.a, %c ; second user of %fneg.a
1323   store volatile half %fneg, half addrspace(1)* %out.gep
1324   store volatile float %use1, float addrspace(1)* undef
1325   ret void
1326 }
1327
1328 ; --------------------------------------------------------------------------------
1329 ; rcp tests
1330 ; --------------------------------------------------------------------------------
1331
1332 ; GCN-LABEL: {{^}}v_fneg_rcp_f32:
1333 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1334 ; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
1335 ; GCN: buffer_store_dword [[RESULT]]
1336 define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { ; fneg of the amdgcn rcp intrinsic, single use
1337   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1338   %tid.ext = sext i32 %tid to i64
1339   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1340   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1341   %a = load volatile float, float addrspace(1)* %a.gep
1342   %rcp = call float @llvm.amdgcn.rcp.f32(float %a)
1343   %fneg = fsub float -0.000000e+00, %rcp ; negation spelled as (-0.0 - x); the checks expect it as a source modifier on v_rcp
1344   store float %fneg, float addrspace(1)* %out.gep
1345   ret void
1346 }
1347
1348 ; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32:
1349 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1350 ; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
1351 ; GCN: buffer_store_dword [[RESULT]]
1352 define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { ; fneg(rcp(fneg(a))) - input and result are both negated
1353   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1354   %tid.ext = sext i32 %tid to i64
1355   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1356   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1357   %a = load volatile float, float addrspace(1)* %a.gep
1358   %fneg.a = fsub float -0.000000e+00, %a ; negate the input
1359   %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
1360   %fneg = fsub float -0.000000e+00, %rcp ; negate the result; the checks expect a plain v_rcp of the loaded value
1361   store float %fneg, float addrspace(1)* %out.gep
1362   ret void
1363 }
1364
1365 ; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32:
1366 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1367 ; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
1368 ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
1369 ; GCN: buffer_store_dword [[RESULT]]
1370 ; GCN: buffer_store_dword [[NEG_A]]
1371 define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { ; fneg(rcp(fneg(a))) where fneg(a) is also stored
1372   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1373   %tid.ext = sext i32 %tid to i64
1374   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1375   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1376   %a = load volatile float, float addrspace(1)* %a.gep
1377   %fneg.a = fsub float -0.000000e+00, %a ; negated input, used by both the rcp call and the second store
1378   %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
1379   %fneg = fsub float -0.000000e+00, %rcp
1380   store volatile float %fneg, float addrspace(1)* %out.gep
1381   store volatile float %fneg.a, float addrspace(1)* undef ; extra use of %fneg.a (store to an undef pointer)
1382   ret void
1383 }
1384
1385 ; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32:
1386 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1387 ; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
1388 ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
1389 ; GCN: buffer_store_dword [[RESULT]]
1390 ; GCN: buffer_store_dword [[MUL]]
1391 define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 { ; fneg(rcp(fneg(a))) where fneg(a) also feeds an fmul by %c
1392   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1393   %tid.ext = sext i32 %tid to i64
1394   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1395   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1396   %a = load volatile float, float addrspace(1)* %a.gep
1397   %fneg.a = fsub float -0.000000e+00, %a ; negated input, used by both the rcp call and the fmul
1398   %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
1399   %fneg = fsub float -0.000000e+00, %rcp
1400   %use1 = fmul float %fneg.a, %c ; second user of %fneg.a
1401   store volatile float %fneg, float addrspace(1)* %out.gep
1402   store volatile float %use1, float addrspace(1)* undef
1403   ret void
1404 }
1405
1406 ; --------------------------------------------------------------------------------
1407 ; rcp_legacy tests
1408 ; --------------------------------------------------------------------------------
1409
1410 ; GCN-LABEL: {{^}}v_fneg_rcp_legacy_f32:
1411 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1412 ; GCN: v_rcp_legacy_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
1413 ; GCN: buffer_store_dword [[RESULT]]
1414 define amdgpu_kernel void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { ; fneg of the amdgcn rcp.legacy intrinsic, single use
1415   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1416   %tid.ext = sext i32 %tid to i64
1417   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1418   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1419   %a = load volatile float, float addrspace(1)* %a.gep
1420   %rcp = call float @llvm.amdgcn.rcp.legacy(float %a)
1421   %fneg = fsub float -0.000000e+00, %rcp ; negation spelled as (-0.0 - x); the checks expect it as a source modifier on v_rcp_legacy
1422   store float %fneg, float addrspace(1)* %out.gep
1423   ret void
1424 }
1425
1426 ; --------------------------------------------------------------------------------
1427 ; fmul_legacy tests
1428 ; --------------------------------------------------------------------------------
1429
1430 ; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32:
1431 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1432 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1433 ; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
1434 ; GCN-NEXT: buffer_store_dword [[RESULT]]
1435 define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; fneg of the amdgcn fmul.legacy intrinsic, single use
1436   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1437   %tid.ext = sext i32 %tid to i64
1438   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1439   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1440   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1441   %a = load volatile float, float addrspace(1)* %a.gep
1442   %b = load volatile float, float addrspace(1)* %b.gep
1443   %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
1444   %fneg = fsub float -0.000000e+00, %mul ; negation spelled as (-0.0 - x); the checks expect it on the second multiply operand
1445   store float %fneg, float addrspace(1)* %out.gep
1446   ret void
1447 }
1448
1449 ; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
1450 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1451 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1452 ; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
1453 ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
1454 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
1455 ; GCN: buffer_store_dword [[ADD]]
1456 define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; fneg of fmul.legacy where the un-negated product is also stored
1457   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1458   %tid.ext = sext i32 %tid to i64
1459   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1460   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1461   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; NOTE(review) unused - the stores below write through %out directly
1462   %a = load volatile float, float addrspace(1)* %a.gep
1463   %b = load volatile float, float addrspace(1)* %b.gep
1464   %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
1465   %fneg = fsub float -0.000000e+00, %mul
1466   store volatile float %fneg, float addrspace(1)* %out
1467   store volatile float %mul, float addrspace(1)* %out ; second user of %mul
1468   ret void
1469 }
1470
1471 ; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
1472 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1473 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1474 ; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
1475 ; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0
1476 ; GCN-NEXT: buffer_store_dword [[ADD]]
1477 ; GCN-NEXT: buffer_store_dword [[MUL]]
1478 define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; fneg of fmul.legacy where the product also feeds a second fmul.legacy by 4.0
1479   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1480   %tid.ext = sext i32 %tid to i64
1481   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1482   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1483   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; NOTE(review) unused - the stores below write through %out directly
1484   %a = load volatile float, float addrspace(1)* %a.gep
1485   %b = load volatile float, float addrspace(1)* %b.gep
1486   %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
1487   %fneg = fsub float -0.000000e+00, %mul
1488   %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0) ; second user of the un-negated %mul
1489   store volatile float %fneg, float addrspace(1)* %out
1490   store volatile float %use1, float addrspace(1)* %out
1491   ret void
1492 }
1493
1494 ; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
1495 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1496 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1497 ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
1498 ; GCN-NEXT: buffer_store_dword [[ADD]]
1499 define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; fneg(fmul.legacy(fneg(a), b))
1500   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1501   %tid.ext = sext i32 %tid to i64
1502   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1503   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1504   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; NOTE(review) unused - the store below writes through %out directly
1505   %a = load volatile float, float addrspace(1)* %a.gep
1506   %b = load volatile float, float addrspace(1)* %b.gep
1507   %fneg.a = fsub float -0.000000e+00, %a ; negate the first operand
1508   %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
1509   %fneg = fsub float -0.000000e+00, %mul ; negate the product; the checks expect a multiply with no negated operands
1510   store volatile float %fneg, float addrspace(1)* %out
1511   ret void
1512 }
1513
1514 ; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
1515 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1516 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1517 ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
1518 ; GCN-NEXT: buffer_store_dword [[ADD]]
1519 define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; fneg(fmul.legacy(a, fneg(b)))
1520   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1521   %tid.ext = sext i32 %tid to i64
1522   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1523   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1524   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; NOTE(review) unused - the store below writes through %out directly
1525   %a = load volatile float, float addrspace(1)* %a.gep
1526   %b = load volatile float, float addrspace(1)* %b.gep
1527   %fneg.b = fsub float -0.000000e+00, %b ; negate the second operand
1528   %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
1529   %fneg = fsub float -0.000000e+00, %mul ; negate the product; the checks expect a multiply with no negated operands
1530   store volatile float %fneg, float addrspace(1)* %out
1531   ret void
1532 }
1533
1534 ; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32:
1535 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1536 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1537 ; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
1538 ; GCN-NEXT: buffer_store_dword [[ADD]]
1539 define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; fneg(fmul.legacy(fneg(a), fneg(b)))
1540   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1541   %tid.ext = sext i32 %tid to i64
1542   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1543   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1544   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; NOTE(review) unused - the store below writes through %out directly
1545   %a = load volatile float, float addrspace(1)* %a.gep
1546   %b = load volatile float, float addrspace(1)* %b.gep
1547   %fneg.a = fsub float -0.000000e+00, %a ; negate the first operand
1548   %fneg.b = fsub float -0.000000e+00, %b ; negate the second operand
1549   %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b)
1550   %fneg = fsub float -0.000000e+00, %mul ; negate the product; the checks expect a single negated operand on the multiply
1551   store volatile float %fneg, float addrspace(1)* %out
1552   ret void
1553 }
1554
1555 ; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32:
1556 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1557 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1558 ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
1559 ; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
1560 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
1561 ; GCN: buffer_store_dword [[NEG_A]]
1562 define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; fneg(fmul.legacy(fneg(a), b)) where fneg(a) is also stored
1563   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1564   %tid.ext = sext i32 %tid to i64
1565   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1566   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1567   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; NOTE(review) unused - the stores below write through %out directly
1568   %a = load volatile float, float addrspace(1)* %a.gep
1569   %b = load volatile float, float addrspace(1)* %b.gep
1570   %fneg.a = fsub float -0.000000e+00, %a ; negated operand, used by both the multiply and the second store
1571   %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
1572   %fneg = fsub float -0.000000e+00, %mul
1573   store volatile float %fneg, float addrspace(1)* %out
1574   store volatile float %fneg.a, float addrspace(1)* %out ; second user of %fneg.a
1575   ret void
1576 }
1577
1578 ; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
1579 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1580 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1581 ; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
1582 ; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
1583 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
1584 ; GCN: buffer_store_dword [[MUL]]
1585 define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { ; fneg(fmul.legacy(fneg(a), b)) where fneg(a) also feeds a second fmul.legacy by %c
1586   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1587   %tid.ext = sext i32 %tid to i64
1588   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1589   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1590   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; NOTE(review) unused - the stores below write through %out directly
1591   %a = load volatile float, float addrspace(1)* %a.gep
1592   %b = load volatile float, float addrspace(1)* %b.gep
1593   %fneg.a = fsub float -0.000000e+00, %a ; negated operand, used by both multiplies
1594   %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
1595   %fneg = fsub float -0.000000e+00, %mul
1596   %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c) ; second user of %fneg.a
1597   store volatile float %fneg, float addrspace(1)* %out
1598   store volatile float %use1, float addrspace(1)* %out
1599   ret void
1600 }
1601
1602 ; --------------------------------------------------------------------------------
1603 ; sin tests
1604 ; --------------------------------------------------------------------------------
1605
1606 ; GCN-LABEL: {{^}}v_fneg_sin_f32:
1607 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1608 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbe22f983, [[A]]
1609 ; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
1610 ; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
1611 ; GCN: buffer_store_dword [[RESULT]]
1612 define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { ; fneg of the generic llvm.sin intrinsic
1613   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1614   %tid.ext = sext i32 %tid to i64
1615   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1616   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1617   %a = load volatile float, float addrspace(1)* %a.gep
1618   %sin = call float @llvm.sin.f32(float %a) ; the checks expect a multiply by 0xbe22f983, then v_fract, then v_sin
1619   %fneg = fsub float -0.000000e+00, %sin ; negation spelled as (-0.0 - x); no separate xor is expected in the checks
1620   store float %fneg, float addrspace(1)* %out.gep
1621   ret void
1622 }
1623
1624 ; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32:
1625 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1626 ; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
1627 ; GCN: buffer_store_dword [[RESULT]]
1628 define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { ; fneg of the target-specific amdgcn.sin intrinsic
1629   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1630   %tid.ext = sext i32 %tid to i64
1631   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1632   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1633   %a = load volatile float, float addrspace(1)* %a.gep
1634   %sin = call float @llvm.amdgcn.sin.f32(float %a)
1635   %fneg = fsub float -0.0, %sin ; negation spelled as (-0.0 - x); the checks expect it as a source modifier on v_sin
1636   store float %fneg, float addrspace(1)* %out.gep
1637   ret void
1638 }
1639
1640 ; --------------------------------------------------------------------------------
1641 ; ftrunc tests
1642 ; --------------------------------------------------------------------------------
1643
1644 ; GCN-LABEL: {{^}}v_fneg_trunc_f32:
1645 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1646 ; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
1647 ; GCN: buffer_store_dword [[RESULT]]
1648 define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { ; fneg of llvm.trunc
1649   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1650   %tid.ext = sext i32 %tid to i64
1651   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1652   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1653   %a = load volatile float, float addrspace(1)* %a.gep
1654   %trunc = call float @llvm.trunc.f32(float %a)
1655   %fneg = fsub float -0.0, %trunc ; negation spelled as (-0.0 - x); the checks expect it as a source modifier on v_trunc
1656   store float %fneg, float addrspace(1)* %out.gep
1657   ret void
1658 }
1659
1660 ; --------------------------------------------------------------------------------
1661 ; fround tests
1662 ; --------------------------------------------------------------------------------
1663
1664 ; GCN-LABEL: {{^}}v_fneg_round_f32:
1665 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1666 ; GCN: v_trunc_f32_e32
1667 ; GCN: v_sub_f32_e32
1668 ; GCN: v_cndmask_b32
1669
1670 ; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
1671 ; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]]
1672
1673 ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
1674 ; GCN: buffer_store_dword [[RESULT]]
1675 define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { ; fneg of llvm.round
1676   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1677   %tid.ext = sext i32 %tid to i64
1678   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1679   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1680   %a = load volatile float, float addrspace(1)* %a.gep
1681   %round = call float @llvm.round.f32(float %a) ; the checks expect an expansion containing v_trunc, v_sub and v_cndmask
1682   %fneg = fsub float -0.0, %round ; the checks expect add+xor in the default run and a single v_sub with a negated operand in the no-signed-zeros run
1683   store float %fneg, float addrspace(1)* %out.gep
1684   ret void
1685 }
1686
1687 ; --------------------------------------------------------------------------------
1688 ; rint tests
1689 ; --------------------------------------------------------------------------------
1690
1691 ; GCN-LABEL: {{^}}v_fneg_rint_f32:
1692 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1693 ; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
1694 ; GCN: buffer_store_dword [[RESULT]]
1695 define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { ; fneg of llvm.rint
1696   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1697   %tid.ext = sext i32 %tid to i64
1698   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1699   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1700   %a = load volatile float, float addrspace(1)* %a.gep
1701   %rint = call float @llvm.rint.f32(float %a)
1702   %fneg = fsub float -0.0, %rint ; negation spelled as (-0.0 - x); the checks expect it as a source modifier on v_rndne
1703   store float %fneg, float addrspace(1)* %out.gep
1704   ret void
1705 }
1706
1707 ; --------------------------------------------------------------------------------
1708 ; nearbyint tests
1709 ; --------------------------------------------------------------------------------
1710
1711 ; GCN-LABEL: {{^}}v_fneg_nearbyint_f32:
1712 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1713 ; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
1714 ; GCN: buffer_store_dword [[RESULT]]
1715 define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { ; fneg of llvm.nearbyint
1716   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1717   %tid.ext = sext i32 %tid to i64
1718   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1719   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1720   %a = load volatile float, float addrspace(1)* %a.gep
1721   %nearbyint = call float @llvm.nearbyint.f32(float %a)
1722   %fneg = fsub float -0.0, %nearbyint ; negation spelled as (-0.0 - x); the checks expect it as a source modifier on v_rndne
1723   store float %fneg, float addrspace(1)* %out.gep
1724   ret void
1725 }
1726
1727 ; --------------------------------------------------------------------------------
1728 ; vintrp tests
1729 ; --------------------------------------------------------------------------------
1730
1731 ; GCN-LABEL: {{^}}v_fneg_interp_p1_f32:
1732 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1733 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1734 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
1735 ; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
1736 ; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
1737 define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; fneg(fmul) used as the float operand of two interp.p1 calls
1738   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1739   %tid.ext = sext i32 %tid to i64
1740   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1741   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1742   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1743   %a = load volatile float, float addrspace(1)* %a.gep
1744   %b = load volatile float, float addrspace(1)* %b.gep
1745   %mul = fmul float %a, %b
1746   %fneg = fsub float -0.0, %mul ; the checks expect the negation on the multiply operand, so both interp calls read the same register
1747   %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0)
1748   %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0) ; differs from the call above only in the first i32 argument
1749   store volatile float %intrp0, float addrspace(1)* %out.gep
1750   store volatile float %intrp1, float addrspace(1)* %out.gep
1751   ret void
1752 }
1753
1754 ; GCN-LABEL: {{^}}v_fneg_interp_p2_f32:
1755 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1756 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1757 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
1758 ; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
1759 ; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
1760 define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { ; fneg(fmul) used as the second float operand of two interp.p2 calls
1761   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1762   %tid.ext = sext i32 %tid to i64
1763   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1764   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1765   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1766   %a = load volatile float, float addrspace(1)* %a.gep
1767   %b = load volatile float, float addrspace(1)* %b.gep
1768   %mul = fmul float %a, %b
1769   %fneg = fsub float -0.0, %mul ; the checks expect the negation on the multiply operand, so both interp calls read the same register
1770   %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0)
1771   %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0) ; differs from the call above only in the first i32 argument
1772   store volatile float %intrp0, float addrspace(1)* %out.gep
1773   store volatile float %intrp1, float addrspace(1)* %out.gep
1774   ret void
1775 }
1776
1777 ; --------------------------------------------------------------------------------
1778 ; CopyToReg tests
1779 ; --------------------------------------------------------------------------------
1780
1781 ; GCN-LABEL: {{^}}v_fneg_copytoreg_f32:
1782 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1783 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1784 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1785 ; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
1786 ; GCN: s_cbranch_scc1
1787
1788 ; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
1789 ; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]]
1790 ; GCN: buffer_store_dword [[MUL1]]
1791
1792 ; GCN: buffer_store_dword [[MUL0]]
1793 define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { ; Kernel: %mul and its negation are defined in the entry block; the negation is only used in %if, %mul itself in %endif.
1794   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1795   %tid.ext = sext i32 %tid to i64
1796   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1797   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1798   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1799   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1800   %a = load volatile float, float addrspace(1)* %a.gep
1801   %b = load volatile float, float addrspace(1)* %b.gep
1802   %c = load volatile float, float addrspace(1)* %c.gep
1803   %mul = fmul float %a, %b ; used by %fneg here and by the store in %endif
1804   %fneg = fsub float -0.0, %mul ; negation spelled as (-0.0 - %mul); its only user is in the %if block
1805   %cmp0 = icmp eq i32 %d, 0
1806   br i1 %cmp0, label %if, label %endif ; %if is entered only when %d == 0
1807
1808 if:
1809   %mul1 = fmul float %fneg, %c ; the value negated in the entry block is consumed here, across the block boundary
1810   store volatile float %mul1, float addrspace(1)* %out.gep
1811   br label %endif
1812
1813 endif:
1814   store volatile float %mul, float addrspace(1)* %out.gep ; the un-negated product is stored on every path
1815   ret void
1816 }
1817
1818 ; --------------------------------------------------------------------------------
1819 ; inlineasm tests
1820 ; --------------------------------------------------------------------------------
1821
1822 ; Can't fold into use, so should fold into source
1823 ; GCN-LABEL: {{^}}v_fneg_inlineasm_f32:
1824 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1825 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1826 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
1827 ; GCN: ; use [[MUL]]
1828 ; GCN: buffer_store_dword [[MUL]]
1829 define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { ; Kernel: the negated product feeds an inline asm operand and a store; %mul has no user other than the negation. %d is unused.
1830   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1831   %tid.ext = sext i32 %tid to i64
1832   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1833   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1834   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1835   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1836   %a = load volatile float, float addrspace(1)* %a.gep
1837   %b = load volatile float, float addrspace(1)* %b.gep
1838   %c = load volatile float, float addrspace(1)* %c.gep ; %c is loaded but not used by anything below
1839   %mul = fmul float %a, %b
1840   %fneg = fsub float -0.0, %mul ; negation spelled as (-0.0 - %mul)
1841   call void asm sideeffect "; use $0", "v"(float %fneg) #0
1842   store volatile float %fneg, float addrspace(1)* %out.gep ; the negated value, not %mul, is what gets stored
1843   ret void
1844 }
1845
1846 ; --------------------------------------------------------------------------------
1847 ; inlineasm tests (multi-use source)
1848 ; --------------------------------------------------------------------------------
1849
1850 ; Can't fold into use, and the source has another use, so the fneg is not folded into it
1851 ; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
1852 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1853 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1854 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
1855 ; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
1856 ; GCN: ; use [[NEG]]
1857 ; GCN: buffer_store_dword [[MUL]]
1858 define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { ; Kernel: the negated product feeds only the inline asm, while the un-negated %mul is also stored. %d is unused.
1859   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1860   %tid.ext = sext i32 %tid to i64
1861   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1862   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1863   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1864   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1865   %a = load volatile float, float addrspace(1)* %a.gep
1866   %b = load volatile float, float addrspace(1)* %b.gep
1867   %c = load volatile float, float addrspace(1)* %c.gep ; %c is loaded but not used by anything below
1868   %mul = fmul float %a, %b ; two users: the negation and the final store
1869   %fneg = fsub float -0.0, %mul ; negation spelled as (-0.0 - %mul)
1870   call void asm sideeffect "; use $0", "v"(float %fneg) #0
1871   store volatile float %mul, float addrspace(1)* %out.gep ; stores %mul (not %fneg), which is the second use of the source
1872   ret void
1873 }
1874
1875 ; --------------------------------------------------------------------------------
1876 ; code size regression tests
1877 ; --------------------------------------------------------------------------------
1878
1879 ; There are multiple users of the fneg that must use a VOP3
1880 ; instruction, so there is no penalty
1881 ; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32:
1882 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1883 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1884 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1885
1886 ; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]]
1887 ; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0
1888 ; GCN-NEXT:     buffer_store_dword [[FMA0]]
1889 ; GCN-NEXT:     buffer_store_dword [[FMA1]]
1890 define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { ; Kernel: one negated load (%fneg.a) is the first operand of two llvm.fma.f32 calls.
1891   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1892   %tid.ext = sext i32 %tid to i64
1893   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1894   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1895   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1896   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; computed but unused: the stores below write through %out directly
1897   %a = load volatile float, float addrspace(1)* %a.gep
1898   %b = load volatile float, float addrspace(1)* %b.gep
1899   %c = load volatile float, float addrspace(1)* %c.gep
1900
1901   %fneg.a = fsub float -0.0, %a ; negation spelled as (-0.0 - %a); used by both fma calls
1902   %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
1903   %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0) ; second user; addend is the constant 2.0
1904
1905   store volatile float %fma0, float addrspace(1)* %out
1906   store volatile float %fma1, float addrspace(1)* %out
1907   ret void
1908 }
1909
1910 ; There are multiple users, but both require using a larger encoding
1911 ; for the modifier.
1912
1913 ; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32:
1914 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1915 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1916 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1917
1918 ; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
1919 ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
1920 ; GCN-NEXT:     buffer_store_dword [[MUL0]]
1921 ; GCN-NEXT:     buffer_store_dword [[MUL1]]
1922 define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { ; Kernel: one negated load (%fneg.a) is the first operand of two fmul instructions.
1923   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1924   %tid.ext = sext i32 %tid to i64
1925   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1926   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1927   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1928   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; computed but unused: the stores below write through %out directly
1929   %a = load volatile float, float addrspace(1)* %a.gep
1930   %b = load volatile float, float addrspace(1)* %b.gep
1931   %c = load volatile float, float addrspace(1)* %c.gep
1932
1933   %fneg.a = fsub float -0.0, %a ; negation spelled as (-0.0 - %a); used by both multiplies
1934   %mul0 = fmul float %fneg.a, %b
1935   %mul1 = fmul float %fneg.a, %c
1936
1937   store volatile float %mul0, float addrspace(1)* %out
1938   store volatile float %mul1, float addrspace(1)* %out
1939   ret void
1940 }
1941
1942 ; One user is VOP3 so has no cost to folding the modifier, the other does.
1943 ; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32:
1944 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1945 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1946 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1947
1948 ; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0
1949 ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
1950
1951 ; GCN:  buffer_store_dword [[FMA0]]
1952 ; GCN-NEXT:     buffer_store_dword [[MUL1]]
1953 define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { ; Kernel: one negated load (%fneg.a) is used by one llvm.fma.f32 call and one fmul.
1954   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1955   %tid.ext = sext i32 %tid to i64
1956   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1957   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1958   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1959   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; computed but unused: the stores below write through %out directly
1960   %a = load volatile float, float addrspace(1)* %a.gep
1961   %b = load volatile float, float addrspace(1)* %b.gep
1962   %c = load volatile float, float addrspace(1)* %c.gep
1963
1964   %fneg.a = fsub float -0.0, %a ; negation spelled as (-0.0 - %a); shared by the fma and the fmul
1965   %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0) ; fma user; addend is the constant 2.0
1966   %mul1 = fmul float %fneg.a, %c ; fmul user
1967
1968   store volatile float %fma0, float addrspace(1)* %out
1969   store volatile float %mul1, float addrspace(1)* %out
1970   ret void
1971 }
1972
1973 ; The use of the fneg requires a code size increase, but folding into
1974 ; the source does not
1975
1976 ; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32:
1977 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1978 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1979 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1980 ; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
1981
1982 ; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0
1983 ; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]]
1984 ; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]
1985
1986 ; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
1987 ; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]]
1988 ; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]
1989
1990 ; GCN: buffer_store_dword [[MUL1]]
1991 ; GCN-NEXT:     buffer_store_dword [[MUL2]]
1992 define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { ; Kernel: the negated result of an llvm.fma.f32 call is the first operand of two fmul instructions.
1993   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1994   %tid.ext = sext i32 %tid to i64
1995   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1996   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1997   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1998   %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
1999   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; computed but unused: the stores below write through %out directly
2000   %a = load volatile float, float addrspace(1)* %a.gep
2001   %b = load volatile float, float addrspace(1)* %b.gep
2002   %c = load volatile float, float addrspace(1)* %c.gep
2003   %d = load volatile float, float addrspace(1)* %d.gep
2004
2005   %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0) ; only user of %fma0 is the negation on the next line
2006   %fneg.fma0 = fsub float -0.0, %fma0 ; negation spelled as (-0.0 - %fma0); used by both multiplies
2007   %mul1 = fmul float %fneg.fma0, %c
2008   %mul2 = fmul float %fneg.fma0, %d
2009
2010   store volatile float %mul1, float addrspace(1)* %out
2011   store volatile float %mul2, float addrspace(1)* %out
2012   ret void
2013 }
2014
2015 ; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
2016 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
2017 ; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
2018 ; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
2019 ; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]
2020
2021 ; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
2022 ; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
2023 ; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]
2024
2025 ; GCN: buffer_store_dwordx2 [[MUL0]]
2026 ; GCN: buffer_store_dwordx2 [[MUL1]]
2027 define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 { ; Kernel: double-precision version of the f32 test above: negated llvm.fma.f64 result used by two fmul instructions.
2028   %tid = call i32 @llvm.amdgcn.workitem.id.x()
2029   %tid.ext = sext i32 %tid to i64
2030   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
2031   %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext
2032   %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext
2033   %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext
2034   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext ; computed but unused: the stores below write through %out directly
2035   %a = load volatile double, double addrspace(1)* %a.gep
2036   %b = load volatile double, double addrspace(1)* %b.gep
2037   %c = load volatile double, double addrspace(1)* %c.gep
2038   %d = load volatile double, double addrspace(1)* %d.gep
2039
2040   %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0) ; only user of %fma0 is the negation on the next line
2041   %fneg.fma0 = fsub double -0.0, %fma0 ; negation spelled as (-0.0 - %fma0); used by both multiplies
2042   %mul1 = fmul double %fneg.fma0, %c
2043   %mul2 = fmul double %fneg.fma0, %d
2044
2045   store volatile double %mul1, double addrspace(1)* %out
2046   store volatile double %mul2, double addrspace(1)* %out
2047   ret void
2048 }
2049
2050 ; %trunc.a has one fneg use, but it requires a code size increase and
2051 ; the fneg can instead be folded for free into the fma.
2052
2053 ; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32:
2054 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2055 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2056 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2057 ; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
2058 ; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
2059 ; GCN: buffer_store_dword [[FMA0]]
2060 define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { ; Kernel: llvm.trunc.f32 result is negated once and that negation is the first operand of a single llvm.fma.f32 call.
2061   %tid = call i32 @llvm.amdgcn.workitem.id.x()
2062   %tid.ext = sext i32 %tid to i64
2063   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2064   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
2065   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
2066   %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
2067   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; computed but unused: the store below writes through %out directly
2068   %a = load volatile float, float addrspace(1)* %a.gep
2069   %b = load volatile float, float addrspace(1)* %b.gep
2070   %c = load volatile float, float addrspace(1)* %c.gep
2071   %d = load volatile float, float addrspace(1)* %d.gep ; %d is loaded but not used by anything below
2072
2073   %trunc.a = call float @llvm.trunc.f32(float %a) ; only user of %trunc.a is the negation on the next line
2074   %trunc.fneg.a = fsub float -0.0, %trunc.a ; negation spelled as (-0.0 - %trunc.a)
2075   %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
2076   store volatile float %fma0, float addrspace(1)* %out
2077   ret void
2078 }
2079
2080 ; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src:
2081 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2082 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2083 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2084 ; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
2085 ; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
2086 ; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
2087 ; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]]
2088 ; GCN: buffer_store_dword [[FMA0]]
2089 ; GCN: buffer_store_dword [[MUL1]]
2090 define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { ; Kernel: llvm.trunc.f32 result is used both negated (by an llvm.fma.f32 call) and un-negated (by an fmul).
2091   %tid = call i32 @llvm.amdgcn.workitem.id.x()
2092   %tid.ext = sext i32 %tid to i64
2093   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2094   %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
2095   %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
2096   %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
2097   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext ; computed but unused: the stores below write through %out directly
2098   %a = load volatile float, float addrspace(1)* %a.gep
2099   %b = load volatile float, float addrspace(1)* %b.gep
2100   %c = load volatile float, float addrspace(1)* %c.gep
2101   %d = load volatile float, float addrspace(1)* %d.gep
2102
2103   %trunc.a = call float @llvm.trunc.f32(float %a) ; two users: the negation below and %mul1
2104   %trunc.fneg.a = fsub float -0.0, %trunc.a ; negation spelled as (-0.0 - %trunc.a)
2105   %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c) ; consumes the negated value
2106   %mul1 = fmul float %trunc.a, %d ; consumes the un-negated value
2107   store volatile float %fma0, float addrspace(1)* %out
2108   store volatile float %mul1, float addrspace(1)* %out
2109   ret void
2110 }
2111
2112 declare i32 @llvm.amdgcn.workitem.id.x() #1
2113 declare float @llvm.fma.f32(float, float, float) #1
2114 declare float @llvm.fmuladd.f32(float, float, float) #1
2115 declare float @llvm.sin.f32(float) #1
2116 declare float @llvm.trunc.f32(float) #1
2117 declare float @llvm.round.f32(float) #1
2118 declare float @llvm.rint.f32(float) #1
2119 declare float @llvm.nearbyint.f32(float) #1
2120 declare float @llvm.minnum.f32(float, float) #1
2121 declare float @llvm.maxnum.f32(float, float) #1
2122
2123 declare double @llvm.fma.f64(double, double, double) #1
2124
2125 declare float @llvm.amdgcn.sin.f32(float) #1
2126 declare float @llvm.amdgcn.rcp.f32(float) #1
2127 declare float @llvm.amdgcn.rcp.legacy(float) #1
2128 declare float @llvm.amdgcn.fmul.legacy(float, float) #1
2129 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
2130 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
2131
2132 attributes #0 = { nounwind }
2133 attributes #1 = { nounwind readnone }