; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GCN,FP16 %s

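; Check that divergence-driven instruction selection uses VALU bit operations
; (V_XOR_B32 / V_AND_B32 / V_OR_B32) for divergent fneg, fabs and fneg(fabs())
; and their SALU counterparts (S_XOR_B32 / S_AND_B32 / S_OR_B32) for uniform
; ones, with the sign/magnitude bit mask materialized by S_MOV_B32.
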
define amdgpu_kernel void @divergent_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
; GCN-LABEL: name: divergent_fneg_f32
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GCN: V_XOR_B32_e64 killed %[[REG]]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %val = load volatile float, float addrspace(1)* %in.gep
  %fneg = fneg float %val
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @uniform_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in, i64 %idx) {
; GCN-LABEL: name: uniform_fneg_f32
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GCN: S_XOR_B32 killed %{{[0-9]+}}, killed %[[REG]]
  %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %idx
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %idx
  %val = load volatile float, float addrspace(1)* %in.gep
  %fneg = fneg float %val
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @divergent_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
; GCN-LABEL: name: divergent_fabs_f32
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
; GCN: V_AND_B32_e64 killed %[[REG]]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %val = load volatile float, float addrspace(1)* %in.gep
  %fabs = call float @llvm.fabs.f32(float %val)
  store float %fabs, float addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @uniform_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in, i64 %idx) {
; GCN-LABEL: name: uniform_fabs_f32
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
; GCN: S_AND_B32 killed %{{[0-9]+}}, killed %[[REG]]
  %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %idx
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %idx
  %val = load volatile float, float addrspace(1)* %in.gep
  %fabs = call float @llvm.fabs.f32(float %val)
  store float %fabs, float addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @divergent_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
; GCN-LABEL: name: divergent_fneg_fabs_f32
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GCN: V_OR_B32_e64 killed %[[REG]]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %val = load volatile float, float addrspace(1)* %in.gep
  %fabs = call float @llvm.fabs.f32(float %val)
  %fneg = fneg float %fabs
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @uniform_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in, i64 %idx) {
; GCN-LABEL: name: uniform_fneg_fabs_f32
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GCN: S_OR_B32 killed %{{[0-9]+}}, killed %[[REG]]
  %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %idx
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %idx
  %val = load volatile float, float addrspace(1)* %in.gep
  %fabs = call float @llvm.fabs.f32(float %val)
  %fneg = fneg float %fabs
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

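; For f16 the masks narrow to the low 16 bits: 0x7fff (32767) for fabs and
; 0x8000 (32768) for fneg. The divergent VALU checks use the FP16 prefix, as
; only the gfx900 run is expected to select them this way; SI, presumably,
; legalizes f16 differently.
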
define amdgpu_kernel void @divergent_fabs_f16(half addrspace(1)* %in, half addrspace(1)* %out) {
; GCN-LABEL: name: divergent_fabs_f16
; GCN-LABEL: bb.0 (%ir-block.0)
; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32767
; FP16: V_AND_B32_e64 killed %[[REG]]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %tid.ext
  %val = load volatile half, half addrspace(1)* %in.gep
  %fabs = call half @llvm.fabs.f16(half %val)
  store half %fabs, half addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @uniform_fabs_f16(half addrspace(1)* %in, half addrspace(1)* %out, i64 %idx) {
; GCN-LABEL: name: uniform_fabs_f16
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32767
; GCN: S_AND_B32 killed %{{[0-9]+}}, killed %[[REG]]
  %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %idx
  %val = load volatile half, half addrspace(1)* %in.gep
  %fabs = call half @llvm.fabs.f16(half %val)
  store half %fabs, half addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @divergent_fneg_f16(half addrspace(1)* %in, half addrspace(1)* %out) {
; GCN-LABEL: name: divergent_fneg_f16
; GCN-LABEL: bb.0 (%ir-block.0)
; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32768
; FP16: V_XOR_B32_e64 killed %[[REG]]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %tid.ext
  %val = load volatile half, half addrspace(1)* %in.gep
  %fneg = fneg half %val
  store half %fneg, half addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @uniform_fneg_f16(half addrspace(1)* %in, half addrspace(1)* %out, i64 %idx) {
; GCN-LABEL: name: uniform_fneg_f16
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32768
; GCN: S_XOR_B32 killed %{{[0-9]+}}, killed %[[REG]]
  %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %idx
  %val = load volatile half, half addrspace(1)* %in.gep
  %fneg = fneg half %val
  store half %fneg, half addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @divergent_fneg_fabs_f16(half addrspace(1)* %in, half addrspace(1)* %out) {
; GCN-LABEL: name: divergent_fneg_fabs_f16
; GCN-LABEL: bb.0 (%ir-block.0)
; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32768
; FP16: V_OR_B32_e64 killed %[[REG]]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %tid.ext
  %val = load volatile half, half addrspace(1)* %in.gep
  %fabs = call half @llvm.fabs.f16(half %val)
  %fneg = fneg half %fabs
  store half %fneg, half addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @uniform_fneg_fabs_f16(half addrspace(1)* %in, half addrspace(1)* %out, i64 %idx) {
; GCN-LABEL: name: uniform_fneg_fabs_f16
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32768
; GCN: S_OR_B32 killed %{{[0-9]+}}, killed %[[REG]]
  %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %idx
  %val = load volatile half, half addrspace(1)* %in.gep
  %fabs = call half @llvm.fabs.f16(half %val)
  %fneg = fneg half %fabs
  store half %fneg, half addrspace(1)* %out
  ret void
}

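; The packed v2f16 cases still need only one 32-bit operation: a single mask
; covers both halves, 0x80008000 (-2147450880) for the sign bits and
; 0x7fff7fff (2147450879) for fabs.
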
define amdgpu_kernel void @divergent_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) {
; GCN-LABEL: name: divergent_fneg_v2f16
; GCN-LABEL: bb.0 (%ir-block.0)
; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147450880
; FP16: V_XOR_B32_e64 killed %[[REG]]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
  %fneg = fneg <2 x half> %val
  store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out
  ret void
}

define amdgpu_kernel void @uniform_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 %idx) {
; GCN-LABEL: name: uniform_fneg_v2f16
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147450880
; GCN: S_XOR_B32 killed %{{[0-9]+}}, killed %[[REG]]
  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx
  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
  %fneg = fneg <2 x half> %val
  store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out
  ret void
}

define amdgpu_kernel void @divergent_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) {
; GCN-LABEL: name: divergent_fabs_v2f16
; GCN-LABEL: bb.0 (%ir-block.0)
; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147450879
; FP16: V_AND_B32_e64 killed %[[REG]]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  store <2 x half> %fabs, <2 x half> addrspace(1)* %gep.out
  ret void
}

define amdgpu_kernel void @uniform_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 %idx) {
; GCN-LABEL: name: uniform_fabs_v2f16
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147450879
; GCN: S_AND_B32 killed %{{[0-9]+}}, killed %[[REG]]
  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx
  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  store <2 x half> %fabs, <2 x half> addrspace(1)* %gep.out
  ret void
}

define amdgpu_kernel void @divergent_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) {
; GCN-LABEL: name: divergent_fneg_fabs_v2f16
; GCN-LABEL: bb.0 (%ir-block.0)
; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147450880
; FP16: V_OR_B32_e64 killed %[[REG]]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %fneg = fneg <2 x half> %fabs
  store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out
  ret void
}

define amdgpu_kernel void @uniform_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 %idx) {
; GCN-LABEL: name: uniform_fneg_fabs_v2f16
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147450880
; GCN: S_OR_B32 killed %{{[0-9]+}}, killed %[[REG]]
  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx
  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %fneg = fneg <2 x half> %fabs
  store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out
  ret void
}

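; The v2f32 cases apply the 32-bit mask to each element separately, so two
; bit operations are expected. The mask register is reused by the second one,
; hence no 'killed' flag on %[[REG]] in the checks below.
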
define amdgpu_kernel void @divergent_fneg_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
; GCN-LABEL: name: divergent_fneg_v2f32
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GCN: V_XOR_B32_e64 %[[REG]]
; GCN: V_XOR_B32_e64 %[[REG]]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %tid
  %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4
  %fneg = fneg <2 x float> %val
  store <2 x float> %fneg, <2 x float> addrspace(1)* %gep.out
  ret void
}

define amdgpu_kernel void @uniform_fneg_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in, i32 %idx) {
; GCN-LABEL: name: uniform_fneg_v2f32
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GCN: S_XOR_B32 killed %{{[0-9]+}}, %[[REG]]
; GCN: S_XOR_B32 killed %{{[0-9]+}}, %[[REG]]
  %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx
  %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %idx
  %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4
  %fneg = fneg <2 x float> %val
  store <2 x float> %fneg, <2 x float> addrspace(1)* %gep.out
  ret void
}

define amdgpu_kernel void @divergent_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
; GCN-LABEL: name: divergent_fabs_v2f32
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
; GCN: V_AND_B32_e64 %[[REG]]
; GCN: V_AND_B32_e64 %[[REG]]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %tid
  %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4
  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %val)
  store <2 x float> %fabs, <2 x float> addrspace(1)* %gep.out
  ret void
}

define amdgpu_kernel void @uniform_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in, i32 %idx) {
; GCN-LABEL: name: uniform_fabs_v2f32
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
; GCN: S_AND_B32 killed %{{[0-9]+}}, %[[REG]]
; GCN: S_AND_B32 killed %{{[0-9]+}}, %[[REG]]
  %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx
  %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %idx
  %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4
  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %val)
  store <2 x float> %fabs, <2 x float> addrspace(1)* %gep.out
  ret void
}

define amdgpu_kernel void @divergent_fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
; GCN-LABEL: name: divergent_fneg_fabs_v2f32
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GCN: V_OR_B32_e64 %[[REG]]
; GCN: V_OR_B32_e64 %[[REG]]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %tid
  %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4
  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %val)
  %fneg = fneg <2 x float> %fabs
  store <2 x float> %fneg, <2 x float> addrspace(1)* %gep.out
  ret void
}

define amdgpu_kernel void @uniform_fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in, i32 %idx) {
; GCN-LABEL: name: uniform_fneg_fabs_v2f32
; GCN-LABEL: bb.0 (%ir-block.0)
; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GCN: S_OR_B32 killed %{{[0-9]+}}, %[[REG]]
; GCN: S_OR_B32 killed %{{[0-9]+}}, %[[REG]]
  %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx
  %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %idx
  %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4
  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %val)
  %fneg = fneg <2 x float> %fabs
  store <2 x float> %fneg, <2 x float> addrspace(1)* %gep.out
  ret void
}

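; For f64 only the high dword (sub1) carries the sign bit, so the selected
; code masks just that half and rebuilds the 64-bit value with REG_SEQUENCE,
; copying the low dword (sub0) through unchanged.
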
define amdgpu_kernel void @divergent_fneg_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
; GCN-LABEL: name: divergent_fneg_f64
; GCN-LABEL: bb.0 (%ir-block.0)
; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64
; FP16: %[[VREG64:[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR
; GCN: %[[HI32:[0-9]+]]:vgpr_32 = COPY %[[VREG64]].sub1
; GCN: %[[SREG_MASK:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GCN: %[[XOR:[0-9]+]]:vgpr_32 = V_XOR_B32_e64 killed %[[SREG_MASK]], killed %[[HI32]]
; GCN: %[[LO32:[0-9]+]]:vgpr_32 = COPY %[[VREG64]].sub0
; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[XOR]], %subreg.sub1
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %val = load volatile double, double addrspace(1)* %in.gep
  %fneg = fneg double %val
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @uniform_fneg_f64(double addrspace(1)* %out, double addrspace(1)* %in, i64 %idx) {
; GCN-LABEL: name: uniform_fneg_f64
; GCN-LABEL: bb.0 (%ir-block.0)
; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64
; FP16: %[[VREG64:[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR
; GCN: %[[LO32:[0-9]+]]:sreg_32 = COPY %[[VREG64]].sub0
; GCN: %[[HI32:[0-9]+]]:sreg_32 = COPY %[[VREG64]].sub1
; GCN: %[[SREG_MASK:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GCN: %[[XOR:[0-9]+]]:sreg_32 = S_XOR_B32 killed %[[HI32]], killed %[[SREG_MASK]]
; GCN: %[[XOR_COPY:[0-9]+]]:sreg_32 = COPY %[[XOR]]
; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[XOR_COPY]], %subreg.sub1
  %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %idx
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %idx
  %val = load volatile double, double addrspace(1)* %in.gep
  %fneg = fneg double %val
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @divergent_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
; GCN-LABEL: name: divergent_fabs_f64
; GCN-LABEL: bb.0 (%ir-block.0)
; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64
; FP16: %[[VREG64:[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR
; GCN: %[[HI32:[0-9]+]]:vgpr_32 = COPY %[[VREG64]].sub1
; GCN: %[[SREG_MASK:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
; GCN: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed %[[SREG_MASK]], killed %[[HI32]]
; GCN: %[[LO32:[0-9]+]]:vgpr_32 = COPY %[[VREG64]].sub0
; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[AND]], %subreg.sub1
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %val = load volatile double, double addrspace(1)* %in.gep
  %fabs = call double @llvm.fabs.f64(double %val)
  store double %fabs, double addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @uniform_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in, i64 %idx) {
; GCN-LABEL: name: uniform_fabs_f64
; GCN-LABEL: bb.0 (%ir-block.0)
; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64
; FP16: %[[VREG64:[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR
; GCN: %[[LO32:[0-9]+]]:sreg_32 = COPY %[[VREG64]].sub0
; GCN: %[[HI32:[0-9]+]]:sreg_32 = COPY %[[VREG64]].sub1
; GCN: %[[SREG_MASK:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
; GCN: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %[[HI32]], killed %[[SREG_MASK]]
; GCN: %[[AND_COPY:[0-9]+]]:sreg_32 = COPY %[[AND]]
; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[AND_COPY]], %subreg.sub1
  %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %idx
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %idx
  %val = load volatile double, double addrspace(1)* %in.gep
  %fabs = call double @llvm.fabs.f64(double %val)
  store double %fabs, double addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @divergent_fneg_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
; GCN-LABEL: name: divergent_fneg_fabs_f64
; GCN-LABEL: bb.0 (%ir-block.0)
; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64
; FP16: %[[VREG64:[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR
; GCN: %[[HI32:[0-9]+]]:vgpr_32 = COPY %[[VREG64]].sub1
; GCN: %[[SREG_MASK:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GCN: %[[OR:[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed %[[SREG_MASK]], killed %[[HI32]]
; GCN: %[[LO32:[0-9]+]]:vgpr_32 = COPY %[[VREG64]].sub0
; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[OR]], %subreg.sub1
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %val = load volatile double, double addrspace(1)* %in.gep
  %fabs = call double @llvm.fabs.f64(double %val)
  %fneg = fneg double %fabs
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @uniform_fneg_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in, i64 %idx) {
; GCN-LABEL: name: uniform_fneg_fabs_f64
; GCN-LABEL: bb.0 (%ir-block.0)
; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64
; FP16: %[[VREG64:[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR
; GCN: %[[LO32:[0-9]+]]:sreg_32 = COPY %[[VREG64]].sub0
; GCN: %[[HI32:[0-9]+]]:sreg_32 = COPY %[[VREG64]].sub1
; GCN: %[[SREG_MASK:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; GCN: %[[OR:[0-9]+]]:sreg_32 = S_OR_B32 killed %[[HI32]], killed %[[SREG_MASK]]
; GCN: %[[OR_COPY:[0-9]+]]:sreg_32 = COPY %[[OR]]
; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[OR_COPY]], %subreg.sub1
  %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %idx
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %idx
  %val = load volatile double, double addrspace(1)* %in.gep
  %fabs = call double @llvm.fabs.f64(double %val)
  %fneg = fneg double %fabs
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

declare float @llvm.fabs.f32(float)
declare half @llvm.fabs.f16(half)
declare double @llvm.fabs.f64(double)
declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
declare <2 x float> @llvm.fabs.v2f32(<2 x float>)

declare i32 @llvm.amdgcn.workitem.id.x()