; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -early-live-intervals < %s | FileCheck %s
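
; These tests exercise operand folding in the AMDGPU backend (the
; SIFoldOperands machine pass): immediates and uniform SGPR values should be
; folded directly into VALU instructions where the operand encoding allows
; it. The second RUN line repeats the checks with early live interval
; analysis enabled.

; An SGPR operand should be folded into the VALU add as its scalar source
; rather than first being copied into a VGPR.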
; CHECK-LABEL: {{^}}fold_sgpr:
; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s
define amdgpu_kernel void @fold_sgpr(ptr addrspace(1) %out, i32 %fold) #1 {
entry:
  %tmp0 = icmp ne i32 %fold, 0
  br i1 %tmp0, label %if, label %endif

if:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %offset = add i32 %fold, %id
  %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %offset
  store i32 0, ptr addrspace(1) %tmp1
  br label %endif

endif:
  ret void
}
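
; A constant expression should be constant folded away, with the resulting
; inline immediate (3 + 2 = 5) folded into the VALU or.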
; CHECK-LABEL: {{^}}fold_imm:
; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5
define amdgpu_kernel void @fold_imm(ptr addrspace(1) %out, i32 %cmp) #1 {
entry:
  %fold = add i32 3, 2
  %tmp0 = icmp ne i32 %cmp, 0
  br i1 %tmp0, label %if, label %endif

if:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %val = or i32 %id, %fold
  store i32 %val, ptr addrspace(1) %out
  br label %endif

endif:
  ret void
}
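
; Adding 1 to a 64-bit value should use a SALU add/add-with-carry pair with
; the constant folded in as an operand, rather than materializing the 64-bit
; constant with s_mov_b64 first.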
; CHECK-LABEL: {{^}}fold_64bit_constant_add:
; CHECK-NOT: s_mov_b64
; FIXME: It would be better if we could use v_add here and drop the extra
; v_mov_b32 instructions.
; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1
; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0
; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]
; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
; CHECK: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]],

define amdgpu_kernel void @fold_64bit_constant_add(ptr addrspace(1) %out, i32 %cmp, i64 %val) #1 {
entry:
  %tmp0 = add i64 %val, 1
  store i64 %tmp0, ptr addrspace(1) %out
  ret void
}
; Inline constants should always be folded.

; CHECK-LABEL: {{^}}vector_inline:
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}

define amdgpu_kernel void @vector_inline(ptr addrspace(1) %out) #1 {
entry:
  %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp0, 1
  %tmp2 = add i32 %tmp0, 2
  %tmp3 = add i32 %tmp0, 3
  %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0
  %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1
  %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2
  %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3
  %tmp4 = xor <4 x i32> <i32 5, i32 5, i32 5, i32 5>, %vec3
  store <4 x i32> %tmp4, ptr addrspace(1) %out
  ret void
}
; Immediates with one use should be folded.
; CHECK-LABEL: {{^}}imm_one_use:
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}}

define amdgpu_kernel void @imm_one_use(ptr addrspace(1) %out) #1 {
entry:
  %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = xor i32 %tmp0, 100
  store i32 %tmp1, ptr addrspace(1) %out
  ret void
}
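
; A 32-bit literal immediate with multiple uses should still be folded into
; each use.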
; CHECK-LABEL: {{^}}vector_imm:
; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}}
; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}}
; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}}
; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}}

define amdgpu_kernel void @vector_imm(ptr addrspace(1) %out) #1 {
entry:
  %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp0, 1
  %tmp2 = add i32 %tmp0, 2
  %tmp3 = add i32 %tmp0, 3
  %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0
  %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1
  %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2
  %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3
  %tmp4 = xor <4 x i32> <i32 100, i32 100, i32 100, i32 100>, %vec3
  store <4 x i32> %tmp4, ptr addrspace(1) %out
  ret void
}
; A subregister use operand should not be tied.
; CHECK-LABEL: {{^}}no_fold_tied_subregister:
; CHECK: buffer_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
; CHECK: v_madmk_f32 v[[RES:[0-9]+]], v[[HI]], 0x41200000, v[[LO]]
; CHECK: buffer_store_dword v[[RES]]
define amdgpu_kernel void @no_fold_tied_subregister() #1 {
  %tmp1 = load volatile <2 x float>, ptr addrspace(1) undef
  %tmp2 = extractelement <2 x float> %tmp1, i32 0
  %tmp3 = extractelement <2 x float> %tmp1, i32 1
  %tmp4 = fmul float %tmp3, 10.0
  %tmp5 = fadd float %tmp4, %tmp2
  store volatile float %tmp5, ptr addrspace(1) undef
  ret void
}
; There should be exactly one folding on the same operand.
; CHECK-LABEL: {{^}}no_extra_fold_on_same_opnd
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define void @no_extra_fold_on_same_opnd() #1 {
entry:
  %s0 = load i32, ptr addrspace(5) undef, align 4
  %s0.i64 = zext i32 %s0 to i64
  br label %for.body.i.i

for.body.i.i:
  %s1 = load i32, ptr addrspace(1) undef, align 8
  %s1.i64 = sext i32 %s1 to i64
  %xor = xor i64 %s1.i64, %s0.i64
  %flag = icmp ult i64 %xor, 8
  br i1 %flag, label %if.then, label %if.else

if.then:
  unreachable

if.else:
  unreachable
}
declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }