llvm/test/CodeGen/AMDGPU/valu-i1.ll

   1 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose -disable-block-placement -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s
   2
   3 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   4
   5 ; SI-LABEL: {{^}}test_if:
   6 ; Make sure the i1 values created by the CFG structurizer pass are
   7 ; moved using VALU instructions.
   8 ; The kernel switches on the workitem id; the checks below expect that
   9 ; control flow to be emitted as exec-mask updates plus execz/execnz branches.
  10 ; waitcnt should be inserted after exec modification. NOTE(review): no s_waitcnt is matched below; this remark may be stale.
  11 ; SI:      v_cmp_lt_i32_e32 vcc, 1,
  12 ; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
  13 ; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
  14 ; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
  15 ; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
  16 ; SI-NEXT: s_cbranch_execz [[FLOW_BB:.LBB[0-9]+_[0-9]+]]
  17 ; The fall-through block (named LeafBlock3 in the verbose asm) contains a nested exec-masked branch.
  18 ; SI-NEXT: ; %bb.{{[0-9]+}}: ; %LeafBlock3
  19 ; SI:      s_mov_b64 s[{{[0-9]:[0-9]}}], -1
  20 ; SI:      s_and_saveexec_b64
  21 ; SI-NEXT: s_cbranch_execnz
  22
  23 ; v_mov should be after exec modification. NOTE(review): no v_mov is matched below, only the s_andn2_saveexec_b64 at the flow block.
  24 ; SI: [[FLOW_BB]]:
  25 ; SI-NEXT: s_andn2_saveexec_b64 [[SAVE2]], [[SAVE2]]
  26 ; tid 1 stores 13 and tid 2 stores 17 to dst[b]; the default path compares tid with 2 again and stores 19 or 21.
  27 define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
  28 entry:
  29   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  30   switch i32 %tid, label %default [
  31     i32 1, label %case1
  32     i32 2, label %case2
  33   ]
  34
  35 case1:
  36   %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  37   store i32 13, i32 addrspace(1)* %arrayidx1, align 4
  38   br label %end
  39
  40 case2:
  41   %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  42   store i32 17, i32 addrspace(1)* %arrayidx5, align 4
  43   br label %end
  44 ; Reached for any workitem id other than 1 or 2.
  45 default:
  46   %cmp8 = icmp eq i32 %tid, 2
  47   %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  48   br i1 %cmp8, label %if, label %else
  49
  50 if:
  51   store i32 19, i32 addrspace(1)* %arrayidx10, align 4
  52   br label %end
  53
  54 else:
  55   store i32 21, i32 addrspace(1)* %arrayidx10, align 4
  56   br label %end
  57
  58 end:
  59   ret void
  60 }
  61 ; Single-sided if: workitems with a nonzero id store 999 to dst[tid]; all others go straight to exit.
  62 ; SI-LABEL: {{^}}simple_test_v_if:
  63 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
  64 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
  65 ; SI-NEXT: s_cbranch_execz [[EXIT:.LBB[0-9]+_[0-9]+]]
  66 ; A fall-through block must start on the line after the execz branch.
  67 ; SI-NEXT: ; %bb.{{[0-9]+}}:
  68 ; SI: buffer_store_dword
  69 ; The exit label must be on the line directly after the store.
  70 ; SI-NEXT: {{^}}[[EXIT]]:
  71 ; SI: s_endpgm
  72 define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  73   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  74   %is.0 = icmp ne i32 %tid, 0
  75   br i1 %is.0, label %then, label %exit
  76
  77 then:
  78   %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  79   store i32 999, i32 addrspace(1)* %gep
  80   br label %exit
  81
  82 exit:
  83   ret void
  84 }
  85
  86 ; FIXME: It would be better to emit the s_endpgm directly in the then block.
  87 ; Both the then and exit blocks end in ret void; the expected sequence is the same as for simple_test_v_if.
  88 ; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
  89 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
  90 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
  91 ; SI-NEXT: s_cbranch_execz [[EXIT:.LBB[0-9]+_[0-9]+]]
  92 ; A fall-through block must start on the line after the execz branch.
  93 ; SI-NEXT: ; %bb.{{[0-9]+}}:
  94 ; SI: buffer_store_dword
  95 ; The exit label must be on the line directly after the store.
  96 ; SI-NEXT: {{^}}[[EXIT]]:
  97 ; SI: s_endpgm
  98 define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  99   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 100   %is.0 = icmp ne i32 %tid, 0
 101   br i1 %is.0, label %then, label %exit
 102
 103 then:
 104   %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
 105   store i32 999, i32 addrspace(1)* %gep
 106   ret void
 107
 108 exit:
 109   ret void
 110 }
 111
 112 ; The exit block has more than a ret to execute (a volatile addrspace(3) store).
 113 ; This was miscompiled before function exit blocks were unified, since the
 114 ; endpgm would terminate the then wavefront before reaching the store.
 115 ; Note: the IR compares with icmp ne, while the compare matched below is v_cmp_eq.
 116 ; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
 117 ; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
 118 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 119 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
 120 ; SI: s_cbranch_execnz [[EXIT:.LBB[0-9]+_[0-9]+]]
 121 ; Flow block: s_andn2_saveexec_b64 on the saved mask, then an execz branch to the unified return.
 122 ; SI-NEXT: {{^.LBB[0-9]+_[0-9]+}}: ; %Flow
 123 ; SI-NEXT: s_andn2_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
 124 ; SI-NEXT: s_cbranch_execz [[UNIFIED_RETURN:.LBB[0-9]+_[0-9]+]]
 125 ; then block: an s_waitcnt immediately followed by the buffer_store_dword.
 126 ; SI-NEXT: ; %bb.{{[0-9]+}}: ; %then
 127 ; SI: s_waitcnt
 128 ; SI-NEXT: buffer_store_dword
 129 ; The unified return block holds the s_endpgm.
 130 ; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
 131 ; SI: s_endpgm
 132 ; The exit block with the ds_write_b32 is laid out on the line after s_endpgm.
 133 ; SI-NEXT: {{^}}[[EXIT]]:
 134 ; SI: ds_write_b32
 135 define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
 136   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 137   %is.0 = icmp ne i32 %tid, 0
 138   br i1 %is.0, label %then, label %exit
 139
 140 then:
 141   %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
 142   store i32 999, i32 addrspace(1)* %gep
 143   ret void
 144
 145 exit:
 146   store volatile i32 7, i32 addrspace(3)* undef
 147   ret void
 148 }
 149 ; Loop entered only by workitems with a nonzero id; %i counts up from %tid until %i.inc equals %tid + 64.
 150 ; SI-LABEL: {{^}}simple_test_v_loop:
 151 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 152 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 153 ; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:.LBB[0-9]+_[0-9]+]]
 154
 155 ; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 156 ; NOTE(review): 0x100 below is presumably the 64-iteration trip count scaled by 4 bytes - TODO confirm.
 157 ; SI: [[LABEL_LOOP:.LBB[0-9]+_[0-9]+]]:
 158 ; SI: buffer_load_dword
 159 ; SI-DAG: buffer_store_dword
 160 ; SI-DAG: s_cmpk_lg_i32 s{{[0-9]+}}, 0x100
 161 ; SI: s_cbranch_scc1 [[LABEL_LOOP]]
 162 ; SI: [[LABEL_EXIT]]:
 163 ; SI: s_endpgm
 164 define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
 165 entry:
 166   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 167   %is.0 = icmp ne i32 %tid, 0
 168   %limit = add i32 %tid, 64
 169   br i1 %is.0, label %loop, label %exit
 170 ; NOTE(review): the loop loads from %src directly; %gep.src is computed but never used - confirm this is intended.
 171 loop:
 172   %i = phi i32 [%tid, %entry], [%i.inc, %loop]
 173   %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i
 174   %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i
 175   %load = load i32, i32 addrspace(1)* %src
 176   store i32 %load, i32 addrspace(1)* %gep.dst
 177   %i.inc = add nsw i32 %i, 1
 178   %cmp = icmp eq i32 %limit, %i.inc
 179   br i1 %cmp, label %exit, label %loop
 180
 181 exit:
 182   ret void
 183 }
 184 ; Loop with two per-workitem exit conditions: a bound loaded from %arg3[tid] and a -1 value in either loaded operand.
 185 ; SI-LABEL: {{^}}multi_vcond_loop:
 186
 187 ; Load loop limit from buffer.
 188 ; Branch to exit if uniformly not taken.
 189 ; SI: ; %bb.0:
 190 ; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
 191 ; SI: v_cmp_lt_i32_e32 vcc
 192 ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 193 ; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:.LBB[0-9]+_[0-9]+]]
 194
 195 ; Initialize inner condition to false.
 196 ; SI: ; %bb.{{[0-9]+}}: ; %bb10.preheader
 197 ; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 198
 199 ; Clear exec bits for workitems that load -1s. (Despite the ORNEG names, the two compare masks are matched as combined with s_and_b64.)
 200 ; SI: .L[[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
 201 ; SI: buffer_load_dword [[B:v[0-9]+]]
 202 ; SI: buffer_load_dword [[A:v[0-9]+]]
 203 ; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
 204 ; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
 205 ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
 206 ; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
 207 ; SI: s_cbranch_execz [[LABEL_FLOW:.LBB[0-9]+_[0-9]+]]
 208 ; bb20 performs the store for workitems that passed both -1 compares.
 209 ; SI: ; %bb.{{[0-9]+}}: ; %bb20
 210 ; SI: buffer_store_dword
 211 ; Flow block: OR the saved mask back into exec, fold the exit condition into COND_STATE, and branch back while exec is nonzero.
 212 ; SI: [[LABEL_FLOW]]:
 213 ; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
 214 ; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
 215 ; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
 216 ; SI-NEXT: s_or_b64 [[COND_STATE]], [[TMP1]], [[COND_STATE]]
 217 ; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
 218 ; SI-NEXT: s_cbranch_execnz .L[[LABEL_LOOP]]
 219 ; COND_STATE must not appear again between the exit label and s_endpgm.
 220 ; SI: [[LABEL_EXIT]]:
 221 ; SI-NOT: [[COND_STATE]]
 222 ; SI: s_endpgm
 223 define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
 224 bb:
 225   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 226   %tmp4 = sext i32 %tmp to i64
 227   %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
 228   %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
 229   %tmp7 = icmp sgt i32 %tmp6, 0
 230   %tmp8 = sext i32 %tmp6 to i64
 231   br i1 %tmp7, label %bb10, label %bb26
 232 ; Loop header: load one element from %arg1 and %arg2 and leave the loop if either is -1.
 233 bb10:                                             ; preds = %bb, %bb20
 234   %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
 235   %tmp12 = add nsw i64 %tmp11, %tmp4
 236   %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12
 237   %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
 238   %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12
 239   %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
 240   %tmp17 = icmp ne i32 %tmp14, -1
 241   %tmp18 = icmp ne i32 %tmp16, -1
 242   %tmp19 = and i1 %tmp17, %tmp18
 243   br i1 %tmp19, label %bb20, label %bb26
 244 ; Store the sum to %arg and continue while the incremented counter is below the loaded bound.
 245 bb20:                                             ; preds = %bb10
 246   %tmp21 = add nsw i32 %tmp16, %tmp14
 247   %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12
 248   store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
 249   %tmp23 = add nuw nsw i64 %tmp11, 1
 250   %tmp24 = icmp slt i64 %tmp23, %tmp8
 251   br i1 %tmp24, label %bb10, label %bb26
 252
 253 bb26:                                             ; preds = %bb10, %bb20, %bb
 254   ret void
 255 }
 256 ; Attribute groups: #0 is applied at the workitem.id.x call in multi_vcond_loop; #1 is applied to every kernel definition above.
 257 attributes #0 = { nounwind readnone }
 258 attributes #1 = { nounwind }